├── src ├── __init__.py ├── load_data.py ├── tweetstorm.py ├── tweetokenizer.py ├── vader_sentiment.py ├── part_of_speech.py ├── text_emotion.py ├── time_of_day.py ├── ridge_grid_scan.py ├── feature_pipeline.py └── style.py ├── images ├── 1.png ├── 2.png ├── 3.png ├── 4.png ├── ridge.png ├── trump.png ├── neutral.png ├── trump2.png ├── not_trump.png ├── ridge_sns.png ├── flynn_tweet.png ├── oval_office.jpg ├── trump_blank.png ├── trump_robot.jpg ├── trump_ticker.gif ├── trump_clapping.jpg ├── trump_thumbs_up.jpg ├── flynn_tweet_poll.png ├── trump_clapping_lg.jpg └── trump_tweet_birds.gif ├── requirements.txt ├── .gitignore ├── penn_part_of_speech_tags.txt ├── twitterbot.py ├── twitterbot_rf.py ├── twitterbot_knn.py ├── twitterbot_mini_ensemble.py ├── README.md └── TweetAuthorshipPredictor.py /src/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/1.png -------------------------------------------------------------------------------- /images/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/2.png -------------------------------------------------------------------------------- /images/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/3.png -------------------------------------------------------------------------------- /images/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/4.png -------------------------------------------------------------------------------- /images/ridge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/ridge.png -------------------------------------------------------------------------------- /images/trump.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump.png -------------------------------------------------------------------------------- /images/neutral.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/neutral.png -------------------------------------------------------------------------------- /images/trump2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump2.png -------------------------------------------------------------------------------- /images/not_trump.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/not_trump.png -------------------------------------------------------------------------------- /images/ridge_sns.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/ridge_sns.png -------------------------------------------------------------------------------- /images/flynn_tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/flynn_tweet.png -------------------------------------------------------------------------------- /images/oval_office.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/oval_office.jpg -------------------------------------------------------------------------------- /images/trump_blank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_blank.png -------------------------------------------------------------------------------- /images/trump_robot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_robot.jpg -------------------------------------------------------------------------------- /images/trump_ticker.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_ticker.gif -------------------------------------------------------------------------------- /images/trump_clapping.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_clapping.jpg -------------------------------------------------------------------------------- /images/trump_thumbs_up.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_thumbs_up.jpg -------------------------------------------------------------------------------- /images/flynn_tweet_poll.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/flynn_tweet_poll.png -------------------------------------------------------------------------------- /images/trump_clapping_lg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_clapping_lg.jpg -------------------------------------------------------------------------------- /images/trump_tweet_birds.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raffg/trump-tweet-author-identification/HEAD/images/trump_tweet_birds.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==2.1.0 2 | tweepy==3.5.0 3 | nltk==3.2.1 4 | requests==2.18.4 5 | pandas==0.20.3 6 | numpy==1.13.3 7 | Pillow==5.0.0 8 | beautifulsoup4==4.6.0 9 | secrets==1.0.2 10 | scikit_learn==0.19.1 11 | vaderSentiment==2.5 12 | 
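Note: the NLTK-based feature code in src/ (word_tokenize and pos_tag in src/part_of_speech.py and src/text_emotion.py) also needs NLTK's data packages, which installing the pinned requirements above does not fetch. A minimal one-time setup sketch, assuming the nltk version listed in requirements.txt:

    import nltk

    # tokenizer models used by word_tokenize throughout src/
    nltk.download('punkt')
    # tagger model used by pos_tag in src/part_of_speech.py
    nltk.download('averaged_perceptron_tagger')

The Stanford NER jar and the NRC Emotion Lexicon referenced in .gitignore below still have to be downloaded separately.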
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Raw data 2 | data/ 3 | 4 | # .env 5 | .env/ 6 | 7 | # Jupyter Notebooks 8 | *.ipynb_checkpoints 9 | *.ipynb 10 | 11 | # .pyc files 12 | *.pyc 13 | 14 | # twitterscraper query 15 | twitterscraper_query.txt 16 | 17 | # Stanford Named Entity Recognition (NER) library 18 | stanford-ner/ 19 | 20 | # NRC Sentiment-Emotion Lexicons 21 | NRC-Sentiment-Emotion-Lexicons/ 22 | 23 | # pickle files 24 | *.pkl 25 | 26 | # .npy and .npz files 27 | *.npy 28 | *.npz 29 | 30 | # presentation folder 31 | presentation/ 32 | 33 | # ensemble pickles 34 | ensemble/ 35 | -------------------------------------------------------------------------------- /src/load_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def load_json_list(list): 5 | ''' 6 | Takes a list of json files, loads them, and concatenates them 7 | INPUT: a list of json files 8 | OUTPUT: a single concatenated DataFrame 9 | ''' 10 | 11 | files = [] 12 | for file in list: 13 | df = pd.read_json(file) 14 | files.append(df) 15 | return pd.concat(files) 16 | 17 | 18 | def apply_date_mask(df, date_column, start_date, end_date): 19 | ''' 20 | applies mask to a df to include only dates within the given date range 21 | INPUT: a DataFrame, the name of the datetime column, start and end dates 22 | OUTPUT: a DataFrame with a datetime index, sorted by datetime 23 | ''' 24 | 25 | mask = (df[date_column] > start_date) & (df[date_column] <= end_date) 26 | return df.loc[mask] 27 | -------------------------------------------------------------------------------- /src/tweetstorm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def tweetstorm(df, tweet, source, timestamp, time_threshold): 5 | ''' 6 | Takes a DataFrame with a specified column containing tweets, a specified 7 | column identifying the source of the tweet, a specified column indicating 8 | the timestamp of the tweet, and a threshold in seconds defining the 9 | maximimum time which can pass between tweets to define a tweetstorm 10 | INPUT: DataFrame, string, string, string, int 11 | OUTPUT: the original DataFrame with one new column 12 | ''' 13 | 14 | temp = pd.DataFrame() 15 | df = df.copy() 16 | temp['time_diff'] = df.groupby(source)[timestamp].diff().dt.total_seconds() 17 | temp['time_diff_prev'] = temp['time_diff'].shift(-1) 18 | df['tweetstorm'] = temp.eval('time_diff < @time_threshold | \ 19 | time_diff_prev < @time_threshold') 20 | return df 21 | -------------------------------------------------------------------------------- /src/tweetokenizer.py: -------------------------------------------------------------------------------- 1 | from tweetokenize import Tokenizer 2 | 3 | 4 | def tweet_tokens(tweet): 5 | ''' 6 | Takes a tweet and replaces mentions, hashtags, urls, times, and numbers 7 | with a generic label 8 | INPUT: string 9 | OUTPUT: string 10 | ''' 11 | 12 | gettokens = Tokenizer(usernames='USER', urls='URL', 13 | hashtags='HASHTAG', times='TIME', 14 | numbers='NUMBER', allcapskeep=True, 15 | lowercase=False) 16 | tokens = gettokens.tokenize(tweet) 17 | tweet = ' '.join(tokens) 18 | 19 | return tweet 20 | 21 | 22 | def tweet_tokenize(df, column): 23 | ''' 24 | Takes a Data Frame and a specified column of tweets and creates a new 25 | column with the tweetokenized tweet 26 | 
INPUT: DateFrame, string 27 | OUTPUT: the original DataFrame with one new column 28 | ''' 29 | 30 | new_df = df.copy() 31 | new_df['tweetokenize'] = new_df['text'].apply(tweet_tokens) 32 | return new_df 33 | -------------------------------------------------------------------------------- /penn_part_of_speech_tags.txt: -------------------------------------------------------------------------------- 1 | 1. CC Coordinating conjunction 2 | 2. CD Cardinal number 3 | 3. DT Determiner 4 | 4. EX Existential there 5 | 5. FW Foreign word 6 | 6. IN Preposition or subordinating conjunction 7 | 7. JJ Adjective 8 | 8. JJR Adjective, comparative 9 | 9. JJS Adjective, superlative 10 | 10. LS List item marker 11 | 11. MD Modal 12 | 12. NN Noun, singular or mass 13 | 13. NNS Noun, plural 14 | 14. NNP Proper noun, singular 15 | 15. NNPS Proper noun, plural 16 | 16. PDT Predeterminer 17 | 17. POS Possessive ending 18 | 18. PRP Personal pronoun 19 | 19. PRP$ Possessive pronoun 20 | 20. RB Adverb 21 | 21. RBR Adverb, comparative 22 | 22. RBS Adverb, superlative 23 | 23. RP Particle 24 | 24. SYM Symbol 25 | 25. TO to 26 | 26. UH Interjection 27 | 27. VB Verb, base form 28 | 28. VBD Verb, past tense 29 | 29. VBG Verb, gerund or present participle 30 | 30. VBN Verb, past participle 31 | 31. VBP Verb, non-3rd person singular present 32 | 32. VBZ Verb, 3rd person singular present 33 | 33. WDT Wh-determiner 34 | 34. WP Wh-pronoun 35 | 35. WP$ Possessive wh-pronoun 36 | 36. WRB Wh-adverb 37 | -------------------------------------------------------------------------------- /src/vader_sentiment.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 3 | from sklearn.preprocessing import normalize 4 | from sklearn import preprocessing 5 | 6 | 7 | def get_vader_scores(text): 8 | ''' 9 | Takes a string of text and outputs four values for Vader's negative, 10 | neutral, positive, and compound (normalized) sentiment scores 11 | INPUT: a string 12 | OUTPUT: a dictionary of four sentiment scores 13 | ''' 14 | 15 | analyser = SentimentIntensityAnalyzer() 16 | return analyser.polarity_scores(text) 17 | 18 | 19 | def apply_vader(df, column): 20 | ''' 21 | Takes a DataFrame with a specified column of text and adds four new columns 22 | to the DataFrame, corresponding to the Vader sentiment scores 23 | INPUT: DataFrame, string 24 | OUTPUT: the original DataFrame with four additional columns 25 | ''' 26 | 27 | sentiment = pd.DataFrame(df[column].apply(get_vader_scores)) 28 | unpacked = pd.DataFrame([d for idx, d in sentiment[column].iteritems()], 29 | index=sentiment.index) 30 | unpacked['compound'] += 1 31 | columns = {'neu': 'v_neutral', 'pos': 'v_positive', 'neg': 'v_negative'} 32 | unpacked.rename(columns=columns, inplace=True) 33 | return pd.concat([df, unpacked], axis=1) 34 | -------------------------------------------------------------------------------- /src/part_of_speech.py: -------------------------------------------------------------------------------- 1 | from nltk import word_tokenize, pos_tag 2 | from nltk.tag import StanfordNERTagger 3 | 4 | 5 | def pos_tagging(text): 6 | ''' 7 | Takes a string of words and returns a string with parts-of-speech of words 8 | INPUT: string 9 | OUTPUT: string 10 | ''' 11 | pos = pos_tag(word_tokenize(text)) 12 | string = "" 13 | for item in pos: 14 | string += item[1] + " " 15 | return string 16 | 17 | 18 | def ner_tagging(text): 19 | ''' 20 | Takes a 
tweetokenized string of words and uses the Stanford NER Tagger to 21 | replace names, places, and organizations with a standard token 22 | INPUT: string 23 | OUTPUT: string 24 | ''' 25 | st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.' 26 | 'distsim.crf.ser.gz', 'stanford-ner/stanford-ner.' 27 | 'jar', encoding='utf-8') 28 | ner = st.tag(word_tokenize(text)) 29 | string = "" 30 | for item in ner: 31 | if item[1] == 'O': 32 | if item[0] == '<' or item[0] == '@': 33 | string += item[0] 34 | elif item[0] == '>': 35 | string = string[:-1] + item[0] + ' ' 36 | else: 37 | string += item[0] + ' ' 38 | else: 39 | string += item[1] + ' ' 40 | tweet = '' 41 | for word in string.split(): 42 | if word.isupper(): 43 | tweet += word + ' ' 44 | else: 45 | tweet += word.lower() + ' ' 46 | return tweet 47 | -------------------------------------------------------------------------------- /src/text_emotion.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from nltk import word_tokenize 3 | from nltk.stem.snowball import SnowballStemmer 4 | 5 | 6 | def text_emotion(df, column): 7 | ''' 8 | Takes a DataFrame and a specified column of text and adds 10 columns to the 9 | DataFrame for each of the 10 emotions in the NRC Emotion Lexicon, with each 10 | column containing the value of the text in that emotions 11 | INPUT: DataFrame, string 12 | OUTPUT: the original DataFrame with ten new columns 13 | ''' 14 | 15 | new_df = df.copy() 16 | 17 | filepath = ('NRC-Sentiment-Emotion-Lexicons/' 18 | 'NRC-Emotion-Lexicon-v0.92/' 19 | 'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt') 20 | emolex_df = pd.read_csv(filepath, 21 | names=["word", "emotion", "association"], 22 | sep='\t') 23 | emolex_words = emolex_df.pivot(index='word', 24 | columns='emotion', 25 | values='association').reset_index() 26 | emotions = emolex_words.columns.drop('word') 27 | emo_df = pd.DataFrame(0, index=df.index, columns=emotions) 28 | 29 | stemmer = SnowballStemmer("english") 30 | 31 | for i, row in new_df.iterrows(): 32 | document = word_tokenize(new_df.loc[i][column]) 33 | for word in document: 34 | word = stemmer.stem(word.lower()) 35 | emo_score = emolex_words[emolex_words.word == word] 36 | if not emo_score.empty: 37 | for emotion in list(emotions): 38 | emo_df.at[i, emotion] += emo_score[emotion] 39 | 40 | new_df = pd.concat([new_df, emo_df], axis=1) 41 | 42 | return new_df 43 | -------------------------------------------------------------------------------- /src/time_of_day.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | 5 | def time_of_day(df, timestamp): 6 | ''' 7 | Takes a DataFrame and a specified column containing a timestamp and creates 8 | a new column indicating the hour of the day 9 | INPUT: DataFrame, string 10 | OUTPUT: the original DataFrame with one new column 11 | ''' 12 | 13 | new_df = df.copy() 14 | new_df['hour'] = new_df[timestamp].dt.hour 15 | return new_df 16 | 17 | 18 | def period_of_day(df, timestamp): 19 | ''' 20 | Takes a DataFrame and a specified column containing a timestamp and creates 21 | a new column indicating the period of the day in 6-hour increments 22 | INPUT: DataFrame, string 23 | OUTPUT: the original DataFrame with one new column 24 | ''' 25 | 26 | new_df = df.copy() 27 | new_df['hour_20_02'] = np.where(((new_df['created_at'].dt.hour >= 20) | 28 | (new_df['created_at'].dt.hour < 2)), 29 | True, False) 30 | new_df['hour_14_20'] = 
np.where(((new_df['created_at'].dt.hour >= 14) & 31 | (new_df['created_at'].dt.hour < 20)), 32 | True, False) 33 | new_df['hour_08_14'] = np.where(((new_df['created_at'].dt.hour >= 8) & 34 | (new_df['created_at'].dt.hour < 14)), 35 | True, False) 36 | new_df['hour_02_08'] = np.where(((new_df['created_at'].dt.hour >= 2) & 37 | (new_df['created_at'].dt.hour < 8)), 38 | True, False) 39 | return new_df 40 | 41 | 42 | def day_of_week(df, timestamp): 43 | ''' 44 | Takes a DataFrame and a specified column containing a timestamp and creates 45 | a new column indicating the day of the week 46 | INPUT: DataFrame, string 47 | OUTPUT: the original DataFrame with one new column 48 | ''' 49 | new_df = df.copy() 50 | new_df['day_of_week'] = new_df[timestamp].dt.weekday 51 | 52 | return new_df 53 | 54 | 55 | def weekend(df, day_of_week): 56 | ''' 57 | Takes a DataFrame and a specified column containing a day of the week and 58 | creates a new column indicating if the day occurs on a weekend 59 | INPUT: DataFrame, string 60 | OUTPUT: the original DataFrame with one new column 61 | ''' 62 | new_df = df.copy() 63 | new_df['weekend'] = new_df[day_of_week].apply(lambda x: 1 if x in [5, 6] else 0) 64 | 65 | return new_df 66 | -------------------------------------------------------------------------------- /src/ridge_grid_scan.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import RidgeClassifier 2 | 3 | 4 | def ridge_grid_scan(X_train, y_train, n=100): 5 | ''' 6 | Recursively performs ridge regression to sort all features in order of 7 | importance and return the top n features 8 | INPUT: X DataFrame, y DataFrame, int 9 | OUTPUT: list of feature importances 10 | ''' 11 | 12 | scan = GridScan(X_train, y_train, n) 13 | return scan.feature_importances 14 | 15 | 16 | class GridScan(object): 17 | ''' 18 | Grid scan object to track alpha levels of ridge regression and features 19 | which are driven out of the model at each alpha levels 20 | ''' 21 | 22 | def __init__(self, X_train, y_train, n): 23 | self.X_train = X_train 24 | self.y_train = y_train 25 | self.n = n 26 | alpha_min = 1e-8 27 | alpha_max = 1e24 28 | self.alpha_levels = {} 29 | self.feature_importances = [] 30 | 31 | self.ridge(alpha_min) 32 | self.ridge(alpha_max) 33 | while len(self.alpha_levels[alpha_max]) < len(self.X_train.columns): 34 | print('alpha too low; increasing value') 35 | alpha_max *= 2 36 | self.ridge(alpha_max) 37 | 38 | self.scan(alpha_min, alpha_max) 39 | 40 | self.feature_importances.sort(key=lambda feature_alpha: 41 | -feature_alpha[1]) 42 | 43 | def scan(self, lower, upper): 44 | ''' 45 | Takes a lower and upper bound for alpha levels and recursively runs 46 | ridge regression until only one feature is eliminated from the model 47 | INPUT: int, int 48 | OUTPUT: 49 | ''' 50 | 51 | mid = (lower + upper) / 2 52 | 53 | if len(self.alpha_levels[upper]) <= (len(self.X_train.columns) - 54 | self.n): 55 | return 56 | 57 | diff = self.alpha_levels[upper] - self.alpha_levels[lower] 58 | if not diff: 59 | return 60 | 61 | if len(diff) == 1: 62 | for feature in diff: 63 | self.feature_importances.append((feature, mid)) 64 | print('========') 65 | print(len(self.feature_importances), feature) 66 | print('{:0.1f}% complete'.format(( 67 | len(self.feature_importances) / self.n) * 100)) 68 | print('========') 69 | return 70 | 71 | self.ridge(mid) 72 | 73 | self.scan(lower, mid) 74 | self.scan(mid, upper) 75 | 76 | def ridge(self, alpha): 77 | ''' 78 | Takes an alpha level and runs 
ridge regression 79 | INPUT: float 80 | OUTPUT: 81 | ''' 82 | 83 | print(alpha) 84 | model = RidgeClassifier(alpha=alpha) 85 | 86 | model.fit(self.X_train, self.y_train) 87 | 88 | feat_coef = list(zip(self.X_train.columns, model.coef_[0])) 89 | 90 | features = set() 91 | for element in feat_coef: 92 | if abs(element[1]) < 1e-24: 93 | features.add(element[0]) 94 | 95 | self.alpha_levels[alpha] = features 96 | -------------------------------------------------------------------------------- /twitterbot.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tweepy 3 | import pandas as pd 4 | import pickle 5 | from time import sleep 6 | from TweetAuthorshipPredictor import TweetAuthorshipPredictor 7 | 8 | 9 | credentials = json.load(open('.env/twitter_credentials.json')) 10 | 11 | consumer_key = credentials['consumer_key'] 12 | consumer_secret = credentials['consumer_secret'] 13 | access_token = credentials['access_token'] 14 | access_token_secret = credentials['access_token_secret'] 15 | 16 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 17 | auth.set_access_token(access_token, access_token_secret) 18 | api = tweepy.API(auth) 19 | 20 | realDonaldTrump = '25073877' 21 | # realDonaldTrump = '14649582' # test 22 | 23 | with open('twitterbot_pickles/trump.pkl', 'rb') as trump: 24 | print('Loading model...') 25 | model = pickle.load(trump) 26 | 27 | 28 | class TrumpStreamListener(tweepy.StreamListener): 29 | 30 | def on_status(self, status): 31 | if status.author.id_str == realDonaldTrump: 32 | tweet = pd.DataFrame(columns=['created_at', 33 | 'favorite_count', 34 | 'id_str', 35 | 'in_reply_to_user_id_str', 36 | 'is_retweet', 37 | 'retweet_count', 38 | 'source', 39 | 'text']) 40 | tweet.loc[0] = [status.created_at, 41 | status.favorite_count, 42 | status.id_str, 43 | status.in_reply_to_user_id_str, 44 | status.retweeted, 45 | status.retweet_count, 46 | status.source, 47 | status.text] 48 | prediction = predict_author(tweet) 49 | post_tweet(status, prediction) 50 | 51 | def on_error(self, status_code): 52 | if status_code == 420: 53 | # returning False in on_data disconnects the stream 54 | print('Hit rate limit, pausing 60 seconds') 55 | sleep(60) 56 | return True 57 | 58 | 59 | def post_tweet(status, prediction): 60 | '''Takes a tweet, formats the response, and posts to Twitter 61 | INPUT: string 62 | OUTPUT: 63 | ''' 64 | url = ('https://twitter.com/' + status.user.screen_name + 65 | '/status/' + status.id_str) 66 | text = str(status.text) 67 | if len(text) >= 114: 68 | text = text[:114] + '…' 69 | 70 | proba = .99 if prediction[1] > .99 else prediction[1] 71 | 72 | if prediction[0] == 0: 73 | tweet = ('I am {0:.0%} confident an aide wrote this:\n' 74 | '"{1}"\n' 75 | '@realDonaldTrump {2}'. 76 | format((1 - proba), text, url)) 77 | else: 78 | tweet = ('I am {0:.0%} confident Trump wrote this:\n' 79 | '"{1}"\n' 80 | '@realDonaldTrump {2}'. 
81 | format(proba, text, url)) 82 | print(tweet) 83 | print() 84 | api.update_status(tweet) 85 | 86 | 87 | def predict_author(tweet): 88 | return model.predict(tweet) 89 | 90 | 91 | def first_tweet(api): 92 | api.update_with_media('images/trump_ticker.gif', 93 | status="Stay tuned!...") 94 | 95 | 96 | def start_stream(): 97 | while True: 98 | try: 99 | trumpstream = tweepy.Stream(auth, trumpstreamlistener) 100 | trumpstream.filter(follow=[realDonaldTrump]) 101 | except: 102 | continue 103 | 104 | 105 | trumpstreamlistener = TrumpStreamListener() 106 | print('Ready!') 107 | start_stream() 108 | -------------------------------------------------------------------------------- /src/feature_pipeline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from src.vader_sentiment import apply_vader 4 | from src.text_emotion import text_emotion 5 | from src.style import apply_avg_lengths, tweet_length, punctuation_columns, \ 6 | quoted_retweet, apply_all_caps, mention_hashtag_url, \ 7 | mention_start, random_capitalization 8 | from src.tweetstorm import tweetstorm 9 | from src.time_of_day import time_of_day, period_of_day, day_of_week, weekend 10 | from src.part_of_speech import pos_tagging, ner_tagging 11 | from src.tweetokenizer import tweet_tokenize, tweet_tokens 12 | 13 | 14 | def feature_pipeline(df, verbose=False): 15 | # ========================================================================= 16 | # Feature engineering 17 | # ========================================================================= 18 | if verbose: 19 | print() 20 | print('Feature engineering') 21 | 22 | # Dummify is_reply column 23 | if verbose: 24 | print(' dummifying is_reply column') 25 | df['in_reply_to_user_id_str'].fillna(0, inplace=True) 26 | df['is_reply'] = np.where(df['in_reply_to_user_id_str'], 1, 0) 27 | 28 | # Create columns for vader sentiment 29 | if verbose: 30 | print(' calculating vader sentiment') 31 | df = apply_vader(df, 'text') 32 | 33 | # Create columns for NRC Emotion Lexicon 34 | if verbose: 35 | print(' calculating NRC Emotion Lexicon score') 36 | df = text_emotion(df, 'text') 37 | 38 | # Create columns for average tweet, sentence, and word length of tweet 39 | if verbose: 40 | print(' calculating average sentence and word length') 41 | df = apply_avg_lengths(df, 'text') 42 | 43 | # Create columns for counts of punctuation 44 | if verbose: 45 | print(' calculating punctuation counts') 46 | punctuation_dict = {'commas': ',', 'semicolons': ';', 'exclamations': '!', 47 | 'periods': '.', 'questions': '?', 'quotes': '"', 48 | 'ellipses': '...'} 49 | 50 | df = punctuation_columns(df, 'text', punctuation_dict) 51 | 52 | # Create columns for counts of @mentions, #hashtags, and urls 53 | if verbose: 54 | print(' calculating mentions, hashtags, and url counts') 55 | df = mention_hashtag_url(df, 'text') 56 | 57 | # Create column identifying if the tweet is surrounding by quote marks 58 | if verbose: 59 | print(' calculating quoted retweet') 60 | df = quoted_retweet(df, 'text') 61 | 62 | # Create column indicating the count of fully capitalized words in a tweet 63 | if verbose: 64 | print(' calculating fully capitalized word counts') 65 | df = apply_all_caps(df, 'text') 66 | 67 | # Create column identifying if the tweet is part of a tweetstorm 68 | # if verbose: 69 | # print(' calculating tweetstorm') 70 | # df = tweetstorm(df, 'text', 'source', 'created_at', 600) 71 | 72 | # Create column identifying the hour of the day that 
the tweet was posted 73 | if verbose: 74 | print(' calculating time of day') 75 | df = time_of_day(df, 'created_at') 76 | 77 | # Create column identifying the day of the week that the tweet was posted 78 | if verbose: 79 | print(' calculating day of week') 80 | df = day_of_week(df, 'created_at') 81 | 82 | # Create column identifying if the day of the week occurred on a weekend 83 | if verbose: 84 | print(' calculating weekend') 85 | df = weekend(df, 'day_of_week') 86 | 87 | # Create column identifying the period of the day, in 6-hour increments 88 | if verbose: 89 | print(' calculating period of day') 90 | df = period_of_day(df, 'created_at') 91 | 92 | # Create column finding the number of randomly capitalized words 93 | if verbose: 94 | print(' calculating randomly capitalized words') 95 | df = random_capitalization(df, 'text') 96 | 97 | # Create column of tweetokenize tweets 98 | if verbose: 99 | print(' calculating tweetokenize tweets') 100 | df = tweet_tokenize(df, 'text') 101 | 102 | # Create column identifying if the tweet begins with an @mentions 103 | if verbose: 104 | print(' calculating @mention beginnings') 105 | df['start_mention'] = df['tweetokenize'].apply(mention_start) 106 | 107 | # Part of speech tagging 108 | if verbose: 109 | print(' calculating part of speech') 110 | df['pos'] = df['tweetokenize'].apply(pos_tagging) 111 | 112 | # Create ner column for Name Entity Recognition 113 | if verbose: 114 | print() 115 | print('Performing NER') 116 | df['ner'] = df['tweetokenize'].apply(ner_tagging) 117 | 118 | return df.drop(['source'], axis=1) 119 | -------------------------------------------------------------------------------- /src/style.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | from src import tweetokenizer as t 4 | 5 | 6 | def sentence_word_length(text): 7 | ''' 8 | Finds the average length of sentences and words in a given text 9 | INPUT: string 10 | OUTPUT: float(average sentence length), float(average word length) 11 | ''' 12 | 13 | sentence_lengths = [] 14 | word_lengths = [] 15 | sentences = [s.strip() for s in re.split('[\.\?!]', text) if s] 16 | for sentence in sentences: 17 | words = sentence.split() 18 | word_lengths = word_lengths + [len(word) for word in words] 19 | sentence_length = len(words) 20 | sentence_lengths.append(sentence_length) 21 | return (sum(sentence_lengths) / float(len(sentence_lengths)), 22 | sum(word_lengths) / float(len(word_lengths))) 23 | 24 | 25 | def apply_avg_lengths(df, column): 26 | ''' 27 | Takes a DataFrame with a specified column of text and adds two new columns 28 | to the DataFrame, corresponding to the average sentence and word lengths 29 | INPUT: DataFrame, string 30 | OUTPUT: the original DataFrame with two additional columns 31 | ''' 32 | 33 | avg_lengths = pd.DataFrame(df[column].apply(sentence_word_length)) 34 | unpacked = pd.DataFrame([d for idx, d in avg_lengths[column].iteritems()], 35 | index=avg_lengths.index) 36 | unpacked.columns = ['avg_sentence_length', 'avg_word_length'] 37 | return pd.concat([df, unpacked], axis=1) 38 | 39 | 40 | def tweet_length(df, column): 41 | ''' 42 | Takes a DataFrame and the name of a column of text and creates a new 43 | column containing the count of characters of the text 44 | INPUT: DataFrame, string 45 | OUTPUT: the original DataFrame, with one new column 46 | ''' 47 | 48 | new_df = df.copy() 49 | new_df['tweet_length'] = new_df[column].str.len() 50 | return new_df 51 | 52 | 53 | def 
count_character(text, character): 54 | ''' 55 | Takes a text string and a character and outputs the number of occurances 56 | of that character in the text 57 | INPUT: text string, character string 58 | OUTPUT: int 59 | ''' 60 | 61 | return text.count(character) 62 | 63 | 64 | def punctuation_columns(df, column, punctuation_dict): 65 | ''' 66 | Takes a DataFrame, a column of text, and a dictionary with keys = character 67 | names and values = character, for example {'comma':','}. Creates new 68 | columns containing the number of occurances specified punctuation 69 | INPUT: DataFrame, string of column name, dictionary 70 | OUTPUT: original DataFrame with new columns 71 | ''' 72 | 73 | new_df = df.copy() 74 | for idx in range(len(punctuation_dict)): 75 | col = pd.DataFrame(df[column].apply(count_character, 76 | character=list(punctuation_dict.values())[idx])) 77 | col.columns = [list(punctuation_dict.keys())[idx]] 78 | new_df = pd.concat([new_df, col], axis=1) 79 | 80 | return new_df 81 | 82 | 83 | def mention_hashtag_url(df, column): 84 | ''' 85 | Takes a DataFrame and a specified column of tweetokenized tweets and 86 | creates new columns containing the count of @mentions, #hashtags, and URLs 87 | in the tweet 88 | INPUT: DataFrame, string 89 | OUTPUT: the original DataFrame with four new columns 90 | ''' 91 | 92 | new_df = t.tweet_tokenize(df, 'text') 93 | new_df['mentions'] = new_df['tweetokenize'].apply( 94 | lambda x: x.count('')) 95 | new_df['hashtags'] = new_df['tweetokenize'].apply( 96 | lambda x: x.count('')) 97 | new_df['urls'] = new_df['tweetokenize'].apply( 98 | lambda x: x.count('')) 99 | return new_df 100 | 101 | 102 | def identify_quoted_retweet(text): 103 | ''' 104 | Takes a string of text and returns 1 if the text begins with '"@' and a 0 105 | if not 106 | INPUT: string 107 | OUTPUT: int 108 | ''' 109 | 110 | return (0 if re.match('^"@', text) is None else 1) 111 | 112 | 113 | def quoted_retweet(df, column): 114 | ''' 115 | Takes a DataFrame and a column of text and creates a new colun with 1 if 116 | the text is fully surrounded by quote marks and a 0 if not 117 | INPUT: DataFrame, String of column name 118 | OUPUT: original DataFrame with one new column 119 | ''' 120 | 121 | quote = pd.DataFrame(df[column].apply(identify_quoted_retweet), 122 | index=df.index) 123 | quote.columns = ['is_quoted_retweet'] 124 | return pd.concat([df, quote], axis=1) 125 | 126 | 127 | def all_caps(text): 128 | ''' 129 | Takes a string of text and counts the number of ALL UPPERCASE words 130 | INPUT: string 131 | OUTPUT: int 132 | ''' 133 | 134 | return (len(re.findall('\s([A-Z][A-Z]+)', text))) 135 | 136 | 137 | def apply_all_caps(df, column): 138 | ''' 139 | Takes a DataFrame and a specified column of text and creates a new column 140 | with the count of fully capitalized words in the text 141 | INPUT: DataFrame, string 142 | OUTPUT: the original DataFrame with one new column 143 | ''' 144 | 145 | new_df = df.copy() 146 | new_df['all_caps'] = new_df[column].apply(all_caps) 147 | return new_df 148 | 149 | 150 | def random_capitalization(df, column): 151 | ''' 152 | Takes a DataFrame and a specified column of text and creates a new column 153 | with the count of randomly capitalized words in the text 154 | INPUT: DataFrame, string 155 | OUTPUT: the original DataFrame with one new column 156 | ''' 157 | 158 | new_df = df.copy() 159 | exp = r"(?" and 168 | 0 if not. 
169 | INPUT: string 170 | OUTPUT: int 171 | ''' 172 | return 1 if text[:6] == '' else 0 173 | -------------------------------------------------------------------------------- /twitterbot_rf.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tweepy 3 | import pandas as pd 4 | import pickle 5 | from TweetAuthorshipPredictor import TweetAuthorshipPredictor 6 | from src.feature_pipeline import feature_pipeline 7 | 8 | 9 | credentials = json.load(open('.env/twitter_credentials.json')) 10 | 11 | consumer_key = credentials['consumer_key'] 12 | consumer_secret = credentials['consumer_secret'] 13 | access_token = credentials['access_token'] 14 | access_token_secret = credentials['access_token_secret'] 15 | 16 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 17 | auth.set_access_token(access_token, access_token_secret) 18 | api = tweepy.API(auth) 19 | 20 | realDonaldTrump = '25073877' 21 | # realDonaldTrump = '14649582' 22 | 23 | with open('twitterbot_pickles/rf.pkl', 'rb') as trump: 24 | model = pickle.load(trump) 25 | 26 | std = ['compound', 'anger', 'anticipation', 'disgust', 'fear', 27 | 'joy', 'negative', 'positive', 'sadness', 'surprise', 28 | 'trust', 'tweet_length', 'avg_sentence_length', 29 | 'avg_word_length', 'commas', 'semicolons', 'exclamations', 30 | 'periods', 'questions', 'quotes', 'ellipses', 'mentions', 31 | 'hashtags', 'urls', 'all_caps', 'hour', 'random_caps'] 32 | 33 | feat = ['created_at', 'is_retweet', 'text', 'is_reply', 34 | 'compound', 'v_negative', 'v_neutral', 'v_positive', 35 | 'anger', 'anticipation', 'disgust', 'fear', 'joy', 36 | 'negative', 'positive', 'sadness', 'surprise', 'trust', 37 | 'tweet_length', 'avg_sentence_length', 'avg_word_length', 38 | 'commas', 'semicolons', 'exclamations', 'periods', 39 | 'questions', 'quotes', 'ellipses', 'mentions', 'hashtags', 40 | 'urls', 'is_quoted_retweet', 'all_caps', 'tweetstorm', 41 | 'hour', 'hour_20_02', 'hour_14_20', 'hour_08_14', 42 | 'hour_02_08', 'day_of_week', 'weekend', 'random_caps', 43 | 'start_mention', 'ner', 'pos'] 44 | 45 | 46 | def load_pickle(filename): 47 | # Open pickle filename 48 | print('Pickle load', filename) 49 | with open(filename, 'rb') as f: 50 | return pickle.load(f) 51 | 52 | 53 | tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 54 | tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 55 | tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 56 | text_cols = tfidf_text.get_feature_names() 57 | ner_cols = tfidf_ner.get_feature_names() 58 | pos_cols = tfidf_pos.get_feature_names() 59 | scaler = load_pickle('twitterbot_pickles/scaler.pkl') 60 | top_feats = load_pickle('twitterbot_pickles/top_feats.pkl') 61 | 62 | 63 | class TrumpStreamListener(tweepy.StreamListener): 64 | 65 | def on_status(self, status): 66 | if status.author.id_str == realDonaldTrump: 67 | tweet = df = pd.DataFrame(columns=['created_at', 68 | 'favorite_count', 69 | 'id_str', 70 | 'in_reply_to_user_id_str', 71 | 'is_retweet', 72 | 'retweet_count', 73 | 'source', 74 | 'text']) 75 | 76 | tweet.loc[0] = [status.created_at, 77 | status.favorite_count, 78 | status.id_str, 79 | status.in_reply_to_user_id_str, 80 | status.retweeted, 81 | status.retweet_count, 82 | status.source, 83 | status.text] 84 | prediction = predict_author(tweet) 85 | post_tweet(status, prediction) 86 | 87 | 88 | def post_tweet(status, prediction): 89 | '''Takes a tweet, formats the response, and posts to Twitter 90 | INPUT: string 91 | OUTPUT: 92 | ''' 93 | url = 
str('https://twitter.com/realDonaldTrump/status/' + status.id_str) 94 | url = ('https://twitter.com/' + status.user.screen_name + 95 | '/status/' + status.id_str) 96 | text = str(status.text) 97 | 98 | if prediction[0] == 0: 99 | proba = .99 if prediction[1][0][0] > .99 else prediction[1][0][0] 100 | tweet = ('I am {0:.0%} confident an aide wrote this:\n"{1}..."' 101 | '\n@realDonaldTrump\n' 102 | '{2}'. 103 | format(proba, text[:150], url)) 104 | else: 105 | proba = .99 if prediction[1][0][1] > .99 else prediction[1][0][1] 106 | tweet = ('I am {0:.0%} confident Trump wrote this:\n"{1}..."' 107 | '\n@realDonaldTrump\n' 108 | '{2}'. 109 | format(proba, text[:150], url)) 110 | print(tweet) 111 | print() 112 | api.update_status(tweet) 113 | 114 | 115 | def predict_author(tweet): 116 | X, X_std = prepare_data_for_predict(tweet) 117 | X = X[top_feats[:200]] 118 | return model.predict(X), model.predict_proba(X) 119 | 120 | 121 | def first_tweet(api): 122 | api.update_with_media('images/trump_ticker.gif', 123 | status="Stay tuned!...") 124 | 125 | 126 | def prepare_data_for_predict(X): 127 | ''' Processes the X data with all features and standardizes. 128 | ''' 129 | # Create new feature columns 130 | X = feature_pipeline(X) 131 | X = tfidf_transform(X[feat]) 132 | X_std = standardize(X) 133 | return X, X_std 134 | 135 | 136 | def tfidf_transform(X): 137 | '''Performs a tf-idf transform on the given column of data 138 | ''' 139 | X.reset_index(drop=True, inplace=True) 140 | _tfidf_text = tfidf_text.transform(X['text']) 141 | _tfidf_text = pd.DataFrame(_tfidf_text.todense(), 142 | columns=[text_cols]) 143 | 144 | _tfidf_ner = tfidf_ner.transform(X['ner']) 145 | _tfidf_ner = pd.DataFrame(_tfidf_ner.todense(), 146 | columns=[ner_cols]) 147 | 148 | _tfidf_pos = tfidf_pos.transform(X['pos']) 149 | _tfidf_pos = pd.DataFrame(_tfidf_pos.todense(), 150 | columns=[pos_cols]) 151 | 152 | X = tfidf_remove_dups(X, _tfidf_text, _tfidf_pos, _tfidf_ner) 153 | 154 | return X 155 | 156 | 157 | def tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner): 158 | '''Removes columns in tfidf_pos and tfidf_ner that are duplicates from 159 | tfidf_text, and concatentates the DataFrames 160 | ''' 161 | # Drop ner columns also present in tfidf_text 162 | columns_to_keep = [x for x in tfidf_ner 163 | if x not in tfidf_text] 164 | tfidf_ner = tfidf_ner[columns_to_keep] 165 | 166 | # Drop pos columns also present in ner 167 | columns_to_keep = [x for x in tfidf_pos 168 | if x not in tfidf_ner] 169 | tfidf_pos = tfidf_pos[columns_to_keep] 170 | 171 | X = pd.concat([X, tfidf_text, tfidf_pos, tfidf_ner], axis=1) 172 | return X 173 | 174 | 175 | def standardize(X): 176 | print('Performing Standardization') 177 | X_std = X.copy() 178 | cols = X[std].columns 179 | X_std[std] = pd.DataFrame(scaler.transform( 180 | X[std]), 181 | index=X.index, 182 | columns=cols) 183 | return X_std 184 | 185 | 186 | def start_stream(): 187 | while True: 188 | try: 189 | trumpstream = tweepy.Stream(auth, trumpstreamlistener) 190 | trumpstream.filter(follow=[realDonaldTrump]) 191 | except: 192 | continue 193 | 194 | 195 | trumpstreamlistener = TrumpStreamListener() 196 | start_stream() 197 | -------------------------------------------------------------------------------- /twitterbot_knn.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tweepy 3 | import pandas as pd 4 | import pickle 5 | from TweetAuthorshipPredictor import TweetAuthorshipPredictor 6 | from src.feature_pipeline import 
feature_pipeline 7 | 8 | 9 | credentials = json.load(open('.env/twitter_credentials.json')) 10 | 11 | consumer_key = credentials['consumer_key'] 12 | consumer_secret = credentials['consumer_secret'] 13 | access_token = credentials['access_token'] 14 | access_token_secret = credentials['access_token_secret'] 15 | 16 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 17 | auth.set_access_token(access_token, access_token_secret) 18 | api = tweepy.API(auth) 19 | 20 | realDonaldTrump = '25073877' 21 | # realDonaldTrump = '14649582' 22 | 23 | with open('pickle/ensemble_knn.pkl', 'rb') as trump: 24 | model = pickle.load(trump) 25 | 26 | std = ['compound', 'anger', 'anticipation', 'disgust', 'fear', 27 | 'joy', 'negative', 'positive', 'sadness', 'surprise', 28 | 'trust', 'tweet_length', 'avg_sentence_length', 29 | 'avg_word_length', 'commas', 'semicolons', 'exclamations', 30 | 'periods', 'questions', 'quotes', 'ellipses', 'mentions', 31 | 'hashtags', 'urls', 'all_caps', 'hour', 'random_caps'] 32 | 33 | feat = ['created_at', 'is_retweet', 'text', 'is_reply', 34 | 'compound', 'v_negative', 'v_neutral', 'v_positive', 35 | 'anger', 'anticipation', 'disgust', 'fear', 'joy', 36 | 'negative', 'positive', 'sadness', 'surprise', 'trust', 37 | 'tweet_length', 'avg_sentence_length', 'avg_word_length', 38 | 'commas', 'semicolons', 'exclamations', 'periods', 39 | 'questions', 'quotes', 'ellipses', 'mentions', 'hashtags', 40 | 'urls', 'is_quoted_retweet', 'all_caps', 'tweetstorm', 41 | 'hour', 'hour_20_02', 'hour_14_20', 'hour_08_14', 42 | 'hour_02_08', 'day_of_week', 'weekend', 'random_caps', 43 | 'start_mention', 'ner', 'pos'] 44 | 45 | 46 | def load_pickle(filename): 47 | # Open pickle filename 48 | print('Pickle load', filename) 49 | with open(filename, 'rb') as f: 50 | return pickle.load(f) 51 | 52 | 53 | knn_pca = load_pickle('twitterbot_pickles/knn_pca.pkl') 54 | tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 55 | tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 56 | tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 57 | text_cols = tfidf_text.get_feature_names() 58 | ner_cols = tfidf_ner.get_feature_names() 59 | pos_cols = tfidf_pos.get_feature_names() 60 | scaler = load_pickle('twitterbot_pickles/scaler.pkl') 61 | top_feats = load_pickle('twitterbot_pickles/top_feats.pkl') 62 | 63 | 64 | class TrumpStreamListener(tweepy.StreamListener): 65 | 66 | def on_status(self, status): 67 | if status.author.id_str == realDonaldTrump: 68 | tweet = df = pd.DataFrame(columns=['created_at', 69 | 'favorite_count', 70 | 'id_str', 71 | 'in_reply_to_user_id_str', 72 | 'is_retweet', 73 | 'retweet_count', 74 | 'source', 75 | 'text']) 76 | 77 | tweet.loc[0] = [status.created_at, 78 | status.favorite_count, 79 | status.id_str, 80 | status.in_reply_to_user_id_str, 81 | status.retweeted, 82 | status.retweet_count, 83 | status.source, 84 | status.text] 85 | prediction = predict_author(tweet) 86 | post_tweet(status, prediction) 87 | 88 | 89 | def post_tweet(status, prediction): 90 | '''Takes a tweet, formats the response, and posts to Twitter 91 | INPUT: string 92 | OUTPUT: 93 | ''' 94 | url = str('https://twitter.com/realDonaldTrump/status/' + status.id_str) 95 | url = ('https://twitter.com/' + status.user.screen_name + 96 | '/status/' + status.id_str) 97 | text = str(status.text) 98 | 99 | if prediction[0] == 0: 100 | proba = .99 if prediction[1][0][0] > .99 else prediction[1][0][0] 101 | tweet = ('I am {0:.0%} confident an aide wrote this:\n"{1}..."' 102 | '\n@realDonaldTrump\n' 
103 | '{2}'. 104 | format(proba, text[:150], url)) 105 | else: 106 | proba = .99 if prediction[1][0][1] > .99 else prediction[1][0][1] 107 | tweet = ('I am {0:.0%} confident Trump wrote this:\n"{1}..."' 108 | '\n@realDonaldTrump\n' 109 | '{2}'. 110 | format(proba, text[:150], url)) 111 | print(tweet) 112 | print() 113 | api.update_status(tweet) 114 | 115 | 116 | def predict_author(tweet): 117 | X, X_std = prepare_data_for_predict(tweet) 118 | X_knn = knn_pca.transform(X_std[top_feats[:13]]) 119 | return model.predict(X_knn), model.predict_proba(X_knn) 120 | 121 | 122 | def first_tweet(api): 123 | api.update_with_media('images/trump_ticker.gif', 124 | status="Stay tuned!...") 125 | 126 | 127 | def prepare_data_for_predict(X): 128 | ''' Processes the X data with all features and standardizes. 129 | ''' 130 | # Create new feature columns 131 | X = feature_pipeline(X) 132 | X = tfidf_transform(X[feat]) 133 | X_std = standardize(X) 134 | return X, X_std 135 | 136 | 137 | def tfidf_transform(X): 138 | '''Performs a tf-idf transform on the given column of data 139 | ''' 140 | X.reset_index(drop=True, inplace=True) 141 | _tfidf_text = tfidf_text.transform(X['text']) 142 | _tfidf_text = pd.DataFrame(_tfidf_text.todense(), 143 | columns=[text_cols]) 144 | 145 | _tfidf_ner = tfidf_ner.transform(X['ner']) 146 | _tfidf_ner = pd.DataFrame(_tfidf_ner.todense(), 147 | columns=[ner_cols]) 148 | 149 | _tfidf_pos = tfidf_pos.transform(X['pos']) 150 | _tfidf_pos = pd.DataFrame(_tfidf_pos.todense(), 151 | columns=[pos_cols]) 152 | 153 | X = tfidf_remove_dups(X, _tfidf_text, _tfidf_pos, _tfidf_ner) 154 | 155 | return X 156 | 157 | 158 | def tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner): 159 | '''Removes columns in tfidf_pos and tfidf_ner that are duplicates from 160 | tfidf_text, and concatentates the DataFrames 161 | ''' 162 | # Drop ner columns also present in tfidf_text 163 | columns_to_keep = [x for x in tfidf_ner 164 | if x not in tfidf_text] 165 | tfidf_ner = tfidf_ner[columns_to_keep] 166 | 167 | # Drop pos columns also present in ner 168 | columns_to_keep = [x for x in tfidf_pos 169 | if x not in tfidf_ner] 170 | tfidf_pos = tfidf_pos[columns_to_keep] 171 | 172 | X = pd.concat([X, tfidf_text, tfidf_pos, tfidf_ner], axis=1) 173 | return X 174 | 175 | 176 | def standardize(X): 177 | print('Performing Standardization') 178 | X_std = X.copy() 179 | cols = X[std].columns 180 | X_std[std] = pd.DataFrame(scaler.transform( 181 | X[std]), 182 | index=X.index, 183 | columns=cols) 184 | return X_std 185 | 186 | 187 | def start_stream(): 188 | while True: 189 | try: 190 | trumpstream = tweepy.Stream(auth, trumpstreamlistener) 191 | trumpstream.filter(follow=[realDonaldTrump]) 192 | except: 193 | continue 194 | 195 | 196 | trumpstreamlistener = TrumpStreamListener() 197 | start_stream() 198 | -------------------------------------------------------------------------------- /twitterbot_mini_ensemble.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tweepy 3 | import pandas as pd 4 | import numpy as np 5 | import pickle 6 | from src.feature_pipeline import feature_pipeline 7 | 8 | 9 | credentials = json.load(open('.env/twitter_credentials.json')) 10 | 11 | consumer_key = credentials['consumer_key'] 12 | consumer_secret = credentials['consumer_secret'] 13 | access_token = credentials['access_token'] 14 | access_token_secret = credentials['access_token_secret'] 15 | 16 | auth = tweepy.OAuthHandler(consumer_key, consumer_secret) 17 | 
auth.set_access_token(access_token, access_token_secret) 18 | api = tweepy.API(auth) 19 | 20 | realDonaldTrump = '25073877' 21 | # realDonaldTrump = '14649582' 22 | 23 | std = ['compound', 'anger', 'anticipation', 'disgust', 'fear', 24 | 'joy', 'negative', 'positive', 'sadness', 'surprise', 25 | 'trust', 'tweet_length', 'avg_sentence_length', 26 | 'avg_word_length', 'commas', 'semicolons', 'exclamations', 27 | 'periods', 'questions', 'quotes', 'ellipses', 'mentions', 28 | 'hashtags', 'urls', 'all_caps', 'hour', 'random_caps'] 29 | 30 | feat = ['created_at', 'is_retweet', 'text', 'is_reply', 31 | 'compound', 'v_negative', 'v_neutral', 'v_positive', 32 | 'anger', 'anticipation', 'disgust', 'fear', 'joy', 33 | 'negative', 'positive', 'sadness', 'surprise', 'trust', 34 | 'tweet_length', 'avg_sentence_length', 'avg_word_length', 35 | 'commas', 'semicolons', 'exclamations', 'periods', 36 | 'questions', 'quotes', 'ellipses', 'mentions', 'hashtags', 37 | 'urls', 'is_quoted_retweet', 'all_caps', 'tweetstorm', 38 | 'hour', 'hour_20_02', 'hour_14_20', 'hour_08_14', 39 | 'hour_02_08', 'day_of_week', 'weekend', 'random_caps', 40 | 'start_mention', 'ner', 'pos'] 41 | 42 | 43 | def load_pickle(filename): 44 | # Open pickle filename 45 | print('Pickle load', filename) 46 | with open(filename, 'rb') as f: 47 | return pickle.load(f) 48 | 49 | 50 | rf = load_pickle('twitterbot_pickles/rf.pkl') 51 | gb = load_pickle('twitterbot_pickles/gb.pkl') 52 | knn = load_pickle('twitterbot_pickles/knn.pkl') 53 | knn_pca = load_pickle('twitterbot_pickles/knn_pca.pkl') 54 | tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 55 | tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 56 | tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 57 | text_cols = tfidf_text.get_feature_names() 58 | ner_cols = tfidf_ner.get_feature_names() 59 | pos_cols = tfidf_pos.get_feature_names() 60 | scaler = load_pickle('twitterbot_pickles/scaler.pkl') 61 | top_feats = load_pickle('twitterbot_pickles/top_feats.pkl') 62 | 63 | 64 | class TrumpStreamListener(tweepy.StreamListener): 65 | 66 | def on_status(self, status): 67 | if status.author.id_str == realDonaldTrump: 68 | tweet = df = pd.DataFrame(columns=['created_at', 69 | 'favorite_count', 70 | 'id_str', 71 | 'in_reply_to_user_id_str', 72 | 'is_retweet', 73 | 'retweet_count', 74 | 'source', 75 | 'text']) 76 | 77 | tweet.loc[0] = [status.created_at, 78 | status.favorite_count, 79 | status.id_str, 80 | status.in_reply_to_user_id_str, 81 | status.retweeted, 82 | status.retweet_count, 83 | status.source, 84 | status.text] 85 | prediction = predict_author(tweet) 86 | post_tweet(status, prediction) 87 | 88 | 89 | def post_tweet(status, prediction): 90 | '''Takes a tweet, formats the response, and posts to Twitter 91 | INPUT: string 92 | OUTPUT: 93 | ''' 94 | url = str('https://twitter.com/realDonaldTrump/status/' + status.id_str) 95 | url = ('https://twitter.com/' + status.user.screen_name + 96 | '/status/' + status.id_str) 97 | text = str(status.text) 98 | 99 | if prediction[0] == 0: 100 | proba = .99 if prediction[1][0][0] > .99 else prediction[1][0][0] 101 | tweet = ('I am {0:.0%} confident an aide wrote this:\n"{1}..."' 102 | '\n@realDonaldTrump\n' 103 | '{2}'. 104 | format(proba, text[:150], url)) 105 | else: 106 | proba = .99 if prediction[1][0][1] > .99 else prediction[1][0][1] 107 | tweet = ('I am {0:.0%} confident Trump wrote this:\n"{1}..."' 108 | '\n@realDonaldTrump\n' 109 | '{2}'. 
110 | format(proba, text[:150], url)) 111 | print(tweet) 112 | print() 113 | api.update_status(tweet) 114 | 115 | 116 | def predict_author(tweet): 117 | X, X_std = prepare_data_for_predict(tweet) 118 | X_rf = X[top_feats[:200]] 119 | X_gb = X_std[top_feats[:300]] 120 | X_knn = knn_pca.transform(X_std[top_feats[:13]]) 121 | 122 | rf_results = rf.predict(X_rf), rf.predict_proba(X_rf) 123 | gb_results = gb.predict(X_gb), gb.predict_proba(X_gb) 124 | knn_results = knn.predict(X_knn), knn.predict_proba(X_knn) 125 | 126 | print(rf_results) 127 | print(gb_results) 128 | print(knn_results) 129 | 130 | total = sum([rf_results[0], gb_results[0], knn_results[0]]) 131 | majority = 1 if total > 1 else 0 132 | 133 | zero = -(rf_results[1][0][0] * (rf_results[0] - 1) + 134 | gb_results[1][0][0] * (gb_results[0] - 1) + 135 | knn_results[1][0][0] * (knn_results[0] - 1)) 136 | one = (rf_results[1][0][1] * rf_results[0] + 137 | gb_results[1][0][1] * gb_results[0] + 138 | knn_results[1][0][1] * knn_results[0]) 139 | 140 | proba0 = zero / (3 - total) if total != 3 else 0 141 | proba1 = one / total if total != 0 else 0 142 | 143 | return (np.array([majority]), np.array([[float(proba0), float(proba1)]])) 144 | 145 | 146 | def first_tweet(api): 147 | api.update_with_media('images/trump_ticker.gif', 148 | status="Stay tuned!...") 149 | 150 | 151 | def prepare_data_for_predict(X): 152 | ''' Processes the X data with all features and standardizes. 153 | ''' 154 | # Create new feature columns 155 | X = feature_pipeline(X) 156 | X = tfidf_transform(X[feat]) 157 | X_std = standardize(X) 158 | return X, X_std 159 | 160 | 161 | def tfidf_transform(X): 162 | '''Performs a tf-idf transform on the given column of data 163 | ''' 164 | X.reset_index(drop=True, inplace=True) 165 | _tfidf_text = tfidf_text.transform(X['text']) 166 | _tfidf_text = pd.DataFrame(_tfidf_text.todense(), 167 | columns=[text_cols]) 168 | 169 | _tfidf_ner = tfidf_ner.transform(X['ner']) 170 | _tfidf_ner = pd.DataFrame(_tfidf_ner.todense(), 171 | columns=[ner_cols]) 172 | 173 | _tfidf_pos = tfidf_pos.transform(X['pos']) 174 | _tfidf_pos = pd.DataFrame(_tfidf_pos.todense(), 175 | columns=[pos_cols]) 176 | 177 | X = tfidf_remove_dups(X, _tfidf_text, _tfidf_pos, _tfidf_ner) 178 | 179 | return X 180 | 181 | 182 | def tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner): 183 | '''Removes columns in tfidf_pos and tfidf_ner that are duplicates from 184 | tfidf_text, and concatentates the DataFrames 185 | ''' 186 | # Drop ner columns also present in tfidf_text 187 | columns_to_keep = [x for x in tfidf_ner 188 | if x not in tfidf_text] 189 | tfidf_ner = tfidf_ner[columns_to_keep] 190 | 191 | # Drop pos columns also present in ner 192 | columns_to_keep = [x for x in tfidf_pos 193 | if x not in tfidf_ner] 194 | tfidf_pos = tfidf_pos[columns_to_keep] 195 | 196 | X = pd.concat([X, tfidf_text, tfidf_pos, tfidf_ner], axis=1) 197 | return X 198 | 199 | 200 | def standardize(X): 201 | X_std = X.copy() 202 | cols = X[std].columns 203 | X_std[std] = pd.DataFrame(scaler.transform( 204 | X[std]), 205 | index=X.index, 206 | columns=cols) 207 | return X_std 208 | 209 | 210 | def start_stream(): 211 | while True: 212 | try: 213 | trumpstream = tweepy.Stream(auth, trumpstreamlistener) 214 | trumpstream.filter(follow=[realDonaldTrump]) 215 | except: 216 | continue 217 | 218 | 219 | trumpstreamlistener = TrumpStreamListener() 220 | start_stream() 221 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
1 | # Trump-Tweet Author Identification
2 | 
3 | This project is an attempt to build a model that can identify whether Trump is truly the author of any given tweet on his account, or whether it was written and posted by an aide. It is widely reported in the news that many of his tweets are actually written by staff. With particularly controversial tweets, and especially given that he sometimes makes presidential decrees through his Twitter account, it can be enlightening to have greater certainty in the authorship of his tweets.
4 | 
5 | I have written some Medium posts about this project that go into greater detail about my procedure and results. Those posts can be accessed here:
6 | - [Who’s Tweeting from the Oval Office?](https://towardsdatascience.com/whos-tweeting-from-the-oval-office-96ea5b60c03)
7 | - [Who’s Tweeting from the Oval Office? - Building a Twitter bot](https://towardsdatascience.com/whos-tweeting-from-the-oval-office-building-a-twitter-bot-9c602edf91dd)
8 | 
9 | ![Did Trump Tweet It?](images/trump_ticker.gif)
10 | 
11 | Did Trump tweet it? Or just an aide in Trump clothing?
12 | 
13 | ## Background
14 | On December 1st, 2017, Michael Flynn pleaded guilty to lying to the FBI. The next day, Trump’s personal Twitter account tweeted:
15 | 
16 | ![I had to fire General Flynn because he lied to the Vice President and the FBI. He has pled guilty to those lies. It is a shame because his actions during the transition were lawful. There was nothing to hide!](images/flynn_tweet.png)
17 | 
18 | The controversy arises because on February 14th of that year, the day after Flynn resigned, Trump had asked James Comey, then the director of the FBI, to back off any investigations of Flynn. If Trump knew at the time of his request to Comey that Flynn had indeed lied to the FBI, then Trump's tweet could be seen as evidence that Trump attempted to obstruct justice. After several legal experts argued this point, Trump defended himself by claiming that his lawyer John Dowd wrote and posted the tweet. But did he really?
19 | 
20 | ***
21 | 
22 | Forensic text analysis was an early field in machine learning and has been used in cases as varied as identifying the Unabomber, revealing J.K. Rowling as the true identity of the author Robert Galbraith, and determining the specific authors of each of the Federalist Papers. This project is an effort to use machine learning and these same techniques to identify tweets on [@realDonaldTrump](https://twitter.com/realdonaldtrump) as written by Trump himself or by his staff when using his account. This task, however, is particularly challenging due to the short nature of a tweet--there just isn't much signal to pick up in such a short text.
23 | 
24 | Prior to March 26, 2017, Trump was tweeting using a Samsung Galaxy device while his staff were tweeting using an iPhone. From this information provided in the metadata of each tweet, we know whether it was Trump himself or his staff tweeting (see [Further Reading](#further-reading) below for some articles discussing this assumption). After March, however, Trump switched to using an iPhone as well, so identification of the tweeter cannot come from the metadata alone and must be deduced from the content of the tweet.
25 | 
26 | ### Potential Tweeters
27 | 
28 | These individuals have been reported in the news as possible tweeters on Trump's Twitter account.
The Start Date is the date their association with the Trump Campaign or Administration was announced, and the End Date is the date their positions were terminated. 29 | 30 | |Name|Start Date|End Date|Twitter Handle| 31 | |----|----------|--------|--------------| 32 | |Donald Trump|2009-05-04|present|@realDonaldTrump| 33 | |Sean Spicer|2016-12-22|2017-07-21|@seanspicer| 34 | |Reince Priebus|2016-11-13|2017-07-27|@Reince| 35 | |Steve Bannon|2016-08-17|2017-08-18|@SteveKBannon| 36 | |Kellyanne Conway|2016-07-01|present|@KellyannePolls| 37 | |Anthony Scaramucci|2017-07-21|2017-07-31|@Scaramucci| 38 | |Dan Scavino|2015-06-01|present|@DanScavino| 39 | |John Dowd|2017-07-16|present|N/A| 40 | 41 | 42 | ## Data 43 | 44 | I used Brendan Brown's [Trump Tweet Data Archive](https://github.com/bpb27/trump_tweet_data_archive) to collect all tweets from the beginning of Trump's account in mid-2009 up until the end of 2017. This set consists of nearly 33,000 tweets. Even though I know from whose device each tweet originated, there is still some ambiguity around authorship: Trump is known to dictate tweets to assistants, so a tweet may have Trump's characteristics but be posted from a non-Trump device, and (especially during the campaign) he is known to have written tweets collaboratively with aides, making true authorship unclear. 45 | 46 | ## Feature engineering 47 | 48 | ### Style 49 | I looked at the style of each tweet by counting various punctuation marks (the number of exclamation marks, for example), the number of @mentions and #hashtags, and average tweet/sentence/word length. 50 | 51 | ### Trump quirks 52 | I also created features for what I have recognized as Trump's distinctive Twitter behavior. These features include the "quoted retweet" (where Trump copies and pastes another user's tweet onto his own timeline and surrounds it in quotation marks), words written in ALL CAPS or followed by several exclamation points!!!, and middle-of-the-night tweeting. 53 | 54 | ### Sentiment 55 | I used C.J. Hutto's [VADER](https://github.com/cjhutto/vaderSentiment) package to extract the sentiment of each tweet. VADER, which stands for Valence Aware Dictionary and sEntiment Reasoner, is a lexicon and rule-based tool that is specifically tuned to social media. Given a string of text, it outputs a number between 0 and 1 for negativity, positivity, and neutrality 56 | for the text, as well as a compound score from -1 to 1 which is an aggregate measure. 57 | 58 | ### Emotion 59 | The National Research Council of Canada created a [lexicon](http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) of over 14,000 words, each rated as belonging to any of 10 emotion classes. For each tweet, I counted the number of words belonging to each emotion class and assigned the tweet that count as its score for that emotion. 60 | 61 | ### Word choice 62 | I performed TF-IDF on the text of each tweet in order to pick up vocabulary unique to Trump or his staff. 63 | 64 | ### Grammatical structure 65 | I knew the phrasing of Trump's tweets would stand out from that of his staff, so in order to capture this I performed part-of-speech replacement on each tweet, reducing it to a string of its parts of speech. For example, the phrase "Hello. This is a tweet which has been parsed for parts of speech" would be replaced with "UH . DT BZ DT NN WDT VBZ VBN VBN IN NNS IN NN ", using the [Penn part of speech tags](https://cs.nyu.edu/grishman/jet/guide/PennPOS.html).
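As a concrete illustration of two of the feature extractors above (sentiment and grammatical structure), here is a minimal sketch using the `vaderSentiment` and `nltk` packages listed in `requirements.txt`. It is illustrative only; the project's own implementations live in `src/vader_sentiment.py` and `src/part_of_speech.py` and may differ in their tokenization and preprocessing details.

```python
# Illustrative sketch only (not the project's own feature code).
# Assumes the vaderSentiment SentimentIntensityAnalyzer interface and that the
# NLTK 'punkt' and 'averaged_perceptron_tagger' data packages are installed.
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

def vader_scores(text):
    """Return VADER's neg/neu/pos/compound scores for a tweet."""
    return analyzer.polarity_scores(text)

def pos_replace(text):
    """Replace each token in a tweet with its Penn Treebank POS tag."""
    tokens = nltk.word_tokenize(text)
    return ' '.join(tag for _, tag in nltk.pos_tag(tokens))

tweet = "Hello. This is a tweet which has been parsed for parts of speech"
print(vader_scores(tweet))  # e.g. {'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ...}
print(pos_replace(tweet))   # roughly: UH . DT VBZ DT NN WDT VBZ VBN VBN IN NNS IN NN
```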
66 | 67 | 68 | ## Models 69 | 70 | I created models for Naive Bayes, SVM, Logistic Regression with Ridge Regularization, KNN, and the ensemble methods of Random Forest, Gradient Boosting, and AdaBoost. All models achieved accuracy, precision, and recall rates in the low to mid 90%s, except for Naive Bayes, which was in the mid 80%s. For my final model, I found that an ensemble of these individual models worked best. 71 | 72 | Additionally, I used Ridge Regularization to iteratively drive each of the roughly 900 feature coefficients to zero with ever-increasing alpha values. This allowed me to rank each feature in order of its importance to the logistic regression model. At an alpha-level of 3e22, the first feature dropped out when its regression coefficient was driven to zero. Slowly, more features dropped out until an alpha-level of about 10e25, when the feature dropout rapidly increased. Above an alpha-level of 10e26, the dropout rate slowed down, and the features still remaining at that point are the most influential features in the model. 73 | 74 | ![Ridge Regularization](images/ridge.png) 75 | 76 | ## Results 77 | 78 | One of the most interesting results from my analysis is the set of characteristics which identifies a tweet as coming from Trump or from someone else. From my Ridge analysis, the top Trump features are: 79 | 80 | * Quoted retweet 81 | * @mentions 82 | * Between 10pm and 10am 83 | * Exclamations!!! 84 | * ALL CAPS 85 | * Tweet length: 114 characters 86 | * @realDonaldTrump 87 | 88 | The top features of non-Trump tweets are: 89 | 90 | * True retweets 91 | * The word “via” 92 | * Between 10am and 4pm 93 | * Semicolons 94 | * Periods 95 | * Tweet length: 103 characters 96 | * @BarackObama 97 | 98 | Trump's tweets are in general more emotive than his aides' tweets, exhibiting high scores for the emotions surprise, anger, negativity, disgust, joy, sadness, and fear. Non-Trump tweets, in contrast, are relatively unemotional, and feature many URLs, hashtags, and organization names. 99 | 100 | As for the individual models, Gradient Boosting performed the best on its own, with Random Forest a close second. Naive Bayes performed most poorly of the models tested. 101 | 102 | | |Gradient Boosting|Random Forest|AdaBoost|Logistic Regression|KNN|SVM|Naive Bayes| 103 | |---:|:--------------:|:-----------:|:------:|:-------------:|:---:|:---:|:---------:| 104 | |Accuracy|95%|94%|92%|90%|90%|90%|84%| 105 | |Precision|95%|94%|92%|90%|91%|90%|83%| 106 | |Recall|95%|95%|90%|88%|89%|90%|82%| 107 | 108 | For my final model, I created an ensemble of all seven models, using the majority class as my predictor. 109 | 110 | ## The Flynn Tweet 111 | 112 | And as for that Flynn Tweet? My analysis indicates it was most likely not written by Trump. However, my individual models are split evenly on this one: some predict Trump, others not Trump. The Logistic Regression outputs a probability estimate of 97%, and Naive Bayes of 94%, that it did indeed come from Trump. Correspondingly, the [@RPMMAS](https://twitter.com/RPMMAS) Twitter account performed an informal poll of its users and received almost 2000 responses, with 96% indicating they believed the tweet to have come from Trump: 113 | 114 | ![WH claims his lawyer wrote this tweet: "I had to fire General Flynn because he lied to the Vice President and the FBI. He has pled guilty to those lies. It is a shame because his actions during the transition were lawful. There was nothing to hide!"
Do you believe that's true?](images/flynn_tweet_poll.png) 115 | 116 | A word of caution though: not all of my models individually agreed on this one. Specifically, AdaBoost, KNN, and SVM indicated that it is a non-Trump tweet. Random Forest, Naive Bayes, and Logistic Regression all output Trump as the author. In my opinion, after reviewing thousands of Trump tweets throughout this project and evaluating all features which describe his tweets, I find the topic, sentiment, and emotion very much to be Trumpian, while the phrasing, grammar, and style all indicate another author. I believe the tweet was written collaboratively, with Trump providing the topical features of the tweet and an unknown author actually composing it. 117 | 118 | 119 | ## Sources 120 | 121 | *Many thanks to the following packages and lexicons!* 122 | 123 | Trump's tweet data is from Brendan Brown's [Trump Tweet Data Archive](https://github.com/bpb27/trump_tweet_data_archive) 124 | 125 | Trump aide data was scraped from Twitter using Ahmet Taspinar's [twitterscraper](https://github.com/taspinar/twitterscraper) with the query "twitterscraper 'from:twitter_handle since:2009-01-01 until:2017-12-31' -o scraped_tweets.json" 126 | 127 | VADER sentiment analysis was performed using [C.J. Hutto's VADER package](https://github.com/cjhutto/vaderSentiment) 128 | 129 | The National Research Council of Canada kindly gave me access to the [NRC Word-Emotion Association Lexicon](http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm). Contact: Saif Mohammad (saif.mohammad@nrc-cnrc.gc.ca) 130 | 131 | Lastly, I used Jared Suttles' [Tweetokenize](https://github.com/jaredks/tweetokenize) to aid in my part-of-speech analysis. An updated version of the package which works with Python 3 can be found in my fork [here](https://github.com/raffg/tweetokenize/tree/Python-3). 
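For reference, the twitterscraper query quoted above can be run directly from a shell, once per handle in the table of potential tweeters; the handle shown here is only an example drawn from that table:

```
twitterscraper 'from:DanScavino since:2009-01-01 until:2017-12-31' -o scraped_tweets.json
```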
132 | 133 | 134 | ## Further Reading 135 | 136 | * [Text analysis of Trump's tweets confirms he writes only the (angrier) Android half](http://varianceexplained.org/r/trump-tweets/) 137 | * [How to tell when someone else tweets from @realDonaldTrump](https://www.wired.com/story/tell-when-someone-else-tweets-from-realdonaldtrump/) 138 | * [All the president’s tweets: Fox News enjoys considerable influence over the world’s most important Twitter account](https://www.economist.com/blogs/graphicdetail/2018/01/daily-chart-9) 139 | * [Is Trump's Campaign Locking Him Out of Twitter?](https://www.theatlantic.com/politics/archive/2016/08/donald-trump-twitter-iphone-android/495239/) 140 | * [Timestamp analysis confirms Trump is the author of Android tweets](http://didtrumptweetit.com/timestamp-analysis-trump-android-phone/) 141 | -------------------------------------------------------------------------------- /TweetAuthorshipPredictor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle 4 | from src.ridge_grid_scan import ridge_grid_scan 5 | from src.feature_pipeline import feature_pipeline 6 | from src.load_data import load_json_list, apply_date_mask 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.decomposition import PCA 11 | from sklearn.ensemble import RandomForestClassifier 12 | from sklearn.ensemble import AdaBoostClassifier 13 | from sklearn.ensemble import GradientBoostingClassifier 14 | from sklearn.neighbors import KNeighborsClassifier 15 | from sklearn.naive_bayes import MultinomialNB 16 | from sklearn.naive_bayes import GaussianNB 17 | from sklearn.svm import SVC 18 | from sklearn.linear_model import SGDClassifier 19 | from sklearn.linear_model import LogisticRegression 20 | from sklearn.metrics import accuracy_score, precision_score, recall_score, \ 21 | f1_score 22 | 23 | 24 | def main(): 25 | with open('labeled_data_through_mar_11.pkl', 'rb') as f: 26 | df = pickle.load(f) 27 | 28 | print('Loading data...') 29 | df = apply_date_mask(df, 'created_at', '2009-01-01', '2018-12-31') 30 | 31 | y = pd.DataFrame(np.where(df['label'] == 1, 1, 0)) 32 | X = df.drop(['label'], axis=1) 33 | save_pickle(y, 'ensemble/y_train.pkl') 34 | 35 | trump = TweetAuthorshipPredictor() 36 | trump.fit(X, y) 37 | 38 | save_pickle(trump, 'ensemble/trump.pkl') 39 | 40 | 41 | class TweetAuthorshipPredictor(object): 42 | ''' This class represents the ensemble of models for tweet authorship 43 | prediction 44 | 45 | Parameters 46 | ---------- 47 | featurized: boolean, optional (default=False) 48 | Boolean indicating if the X data has already been featurized. 49 | Use True if sending featurized data to the class. 50 | 51 | Methods 52 | ------- 53 | fit : fit the model to X and y data 54 | predict : predict the authorship of an unlabeled tweet 55 | get_top_features : returns a list of the features ordered by influence 56 | 57 | Attributes 58 | ---------- 59 | top_feats : Array of the features sorted by influence 60 | 61 | Returns 62 | ------- 63 | self: 64 | The initialized TweetAuthorshipPredictor object.
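    Example
    -------
    A minimal usage sketch (assuming `X` is a DataFrame of raw tweets in the
    format expected by feature_pipeline and `y` holds the 0/1 author labels,
    as in main() above; `X_new` is a hypothetical DataFrame of new tweets):

    >>> predictor = TweetAuthorshipPredictor()
    >>> predictor.fit(X, y)
    >>> label, proba = predictor.predict(X_new)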
65 | ''' 66 | 67 | def __init__(self): 68 | ''' Initialize the ensemble object 69 | ''' 70 | # Save the individual ensemble models 71 | self.rf = None 72 | self.ab = None 73 | self.gb = None 74 | self.knn = None 75 | self.nb = None 76 | self.gnb = None 77 | self.svc = None 78 | self.svm = None 79 | self.lr = None 80 | self.ridge = None 81 | 82 | # Save the data processing objects 83 | self.top_feats = None 84 | self.scaler = None 85 | self.knn_pca = None 86 | self.gnb_pca = None 87 | self.tfidf_text = None 88 | self.tfidf_ner = None 89 | self.tfidf_pos = None 90 | 91 | # Columns to standardize 92 | self.std = ['compound', 'anger', 'anticipation', 'disgust', 'fear', 93 | 'joy', 'negative', 'positive', 'sadness', 'surprise', 94 | 'trust', 'avg_sentence_length', 'avg_word_length', 95 | 'commas', 'semicolons', 'exclamations', 'periods', 96 | 'questions', 'quotes', 'ellipses', 'mentions', 97 | 'hashtags', 'urls', 'all_caps', 'random_caps'] 98 | 99 | # Columns to train on prior to tf-idf 100 | self.feat = ['created_at', 'is_retweet', 'text', 'is_reply', 101 | 'compound', 'v_negative', 'v_neutral', 'v_positive', 102 | 'anger', 'anticipation', 'disgust', 'fear', 'joy', 103 | 'negative', 'positive', 'sadness', 'surprise', 'trust', 104 | 'avg_sentence_length', 'avg_word_length', 'commas', 105 | 'semicolons', 'exclamations', 'periods', 'questions', 106 | 'quotes', 'ellipses', 'mentions', 'hashtags', 'urls', 107 | 'is_quoted_retweet', 'all_caps', 'hour_20_02', 108 | 'hour_14_20', 'hour_08_14', 'hour_02_08', 'weekend', 109 | 'random_caps', 'start_mention', 'ner', 'pos'] 110 | 111 | # tf-idf column names 112 | self.text_cols = None 113 | self.pos_cols = None 114 | self.ner_cols = None 115 | 116 | # Set the number of features for each model 117 | self.rf_feats = 200 118 | self.ab_feats = 300 119 | self.gb_feats = 300 120 | self.knn_feats = 13 121 | self.nb_feats = 5 122 | self.gnb_feats = 13 123 | self.svc_feats = 50 124 | self.svm_feats = 300 125 | self.lr_feats = 200 126 | 127 | def fit(self, X_train, y_train): 128 | ''' Train the ensemble with X and y data 129 | 130 | Parameters 131 | ---------- 132 | X: Pandas DataFrame, shape (n_samples, n_features) 133 | The training data. 134 | y: Pandas DataFrame, shape (n_samples, ). 135 | The training response for the optimization. 136 | 137 | Returns 138 | ------- 139 | self: 140 | The fit Ensemble object. 
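    Notes
    -----
    Fitting happens in two stages: nine base classifiers (rf, ab, gb, knn,
    nb, gnb, svc, svm, lr) are trained on subsets of the ridge-ranked
    features in _first_stage_train, and a logistic-regression meta-model
    (self.ridge) is then fit on their predicted probabilities, forming a
    stacking-style ensemble.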
141 | ''' 142 | # Featurize the X data 143 | X_train, X_std_train = self._prepare_data_for_fit(X_train) 144 | 145 | save_pickle(X_train, 'ensemble/X_train.pkl') 146 | save_pickle(X_std_train, 'ensemble/X_std_train.pkl') 147 | # X_train = load_pickle('twitterbot_pickles/X_train.pkl') 148 | # X_std_train = load_pickle('twitterbot_pickles/X_std_train.pkl') 149 | 150 | drop = ['created_at', 'text', 'pos', 'ner'] 151 | 152 | # self.tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 153 | # self.tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 154 | # self.tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 155 | # self.scaler = load_pickle('twitterbot_pickles/scaler.pkl') 156 | self.text_cols = self.tfidf_text.get_feature_names() 157 | self.ner_cols = self.tfidf_ner.get_feature_names() 158 | self.pos_cols = self.tfidf_pos.get_feature_names() 159 | 160 | # Remove non-numeric features 161 | X_train = X_train.drop(drop, axis=1) 162 | X_std_train = X_std_train.drop(drop, axis=1) 163 | 164 | # Load the feature sets 165 | feature_list = ridge_grid_scan(X_train, 166 | np.array(y_train).ravel(), 167 | n=len(X_train.columns)) 168 | self.top_feats = [(x[0]) for x in list(feature_list)] 169 | save_pickle(self.top_feats, 'ensemble/top_feats.pkl') 170 | # self.top_feats = load_pickle('twitterbot_pickles/top_feats.pkl') 171 | 172 | # Train the PCA objects 173 | self._gnb_pca_calc(X_std_train[self.top_feats[:13]]) 174 | self._knn_pca_calc(X_std_train[self.top_feats[:13]]) 175 | save_pickle(self.gnb_pca, 'ensemble/gnb_pca.pkl') 176 | save_pickle(self.knn_pca, 'ensemble/knn_pca.pkl') 177 | # self.gnb_pca = load_pickle('twitterbot_pickles/gnb_pca.pkl') 178 | # self.knn_pca = load_pickle('twitterbot_pickles/knn_pca.pkl') 179 | 180 | # Train the individual models 181 | data, probabilities = self._first_stage_train(X_train, X_std_train, 182 | np.array(y_train). 
183 | ravel()) 184 | X_train_ridge = pd.DataFrame(probabilities) 185 | 186 | save_pickle(X_train_ridge, 'ensemble/X_train_ridge.pkl') 187 | # X_train_ridge = load_pickle('twitterbot_pickles/X_train_ridge.pkl') 188 | 189 | self.ridge = self._ridge(X_train_ridge, np.array(y_train).ravel()) 190 | 191 | return self 192 | 193 | def predict(self, X): 194 | '''Return a label for prediction of the authoriship of the tweet X 195 | 196 | Parameters 197 | ---------- 198 | X: 2d Pandas DataFrame 199 | The feature matrix 200 | 201 | Returns 202 | ------- 203 | y: (1 or 0, probability) 204 | Predicted label, probabilities 205 | ''' 206 | X, X_std = self._prepare_data_for_predict(X) 207 | data, probabilities = self._first_stage_predict(X, X_std) 208 | X_ridge = pd.DataFrame(probabilities) 209 | 210 | prediction = self.ridge.predict(X_ridge) 211 | proba_list = [] 212 | for key, value in probabilities.items(): 213 | if data[key] == prediction: 214 | proba_list.append(probabilities[key]) 215 | proba = np.mean(proba_list) 216 | 217 | return prediction, proba 218 | 219 | def get_top_features(self): 220 | '''Returns a list of the features ordered by influence 221 | ''' 222 | return self.top_feats 223 | 224 | def _standard_scaler(self, X): 225 | # Standardize features 226 | print('Calculating standardization') 227 | self.scaler = StandardScaler() 228 | cols = X.columns 229 | self.scaler.fit(X) 230 | save_pickle(self.scaler, 'ensemble/scaler.pkl') 231 | # self.scaler = load_pickle('twitterbot_pickles/scaler.pkl') 232 | 233 | def _standardize(self, X): 234 | X_std = X.copy() 235 | cols = X[self.std].columns 236 | X_std[self.std] = pd.DataFrame(self.scaler.transform( 237 | X[self.std]), 238 | index=X.index, 239 | columns=cols) 240 | return X_std 241 | 242 | def _prepare_data_for_fit(self, X): 243 | ''' Processes the X data with all features, saves tf-idf vectorizers, 244 | and standardizes. 245 | ''' 246 | # Create new feature columns 247 | # X = feature_pipeline(X, verbose=True) 248 | # save_pickle(X, 'ensemble/X.pkl') 249 | X = load_pickle('X.pkl') 250 | X = apply_date_mask(X, 'created_at', '2009-01-01', '2018-12-31') 251 | 252 | X = self._tfidf_fit_transform(X[self.feat]) 253 | self._standard_scaler(X[self.std]) 254 | X_std = self._standardize(X) 255 | 256 | return X, X_std 257 | 258 | def _prepare_data_for_predict(self, X): 259 | ''' Processes the X data with all features and standardizes. 
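    Unlike _prepare_data_for_fit, this reuses the tf-idf vectorizers and the
    scaler fit during training rather than refitting them on the new data.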
260 | ''' 261 | # Create new feature columns 262 | X = feature_pipeline(X) 263 | X = self._tfidf_transform(X[self.feat]) 264 | X_std = self._standardize(X) 265 | 266 | return X, X_std 267 | 268 | def _first_stage_train(self, X_train, X_std_train, y_train): 269 | '''Train models in first stage of 9 models 270 | ''' 271 | rf_feat = self.top_feats[:self.rf_feats] 272 | ab_feat = self.top_feats[:self.ab_feats] 273 | gb_feat = self.top_feats[:self.gb_feats] 274 | knn_feat = self.top_feats[:self.knn_feats] 275 | nb_feat = self.top_feats[:self.nb_feats] 276 | gnb_feat = self.top_feats[:self.gnb_feats] 277 | svc_feat = self.top_feats[:self.svc_feats] 278 | svm_feat = self.top_feats[:self.svm_feats] 279 | lr_feat = self.top_feats[:self.lr_feats] 280 | 281 | rf_results = self._random_forest(X_train[rf_feat], y_train) 282 | ab_results = self._adaboost(X_std_train[ab_feat], y_train) 283 | gb_results = self._gradient_boosting(X_std_train[gb_feat], y_train) 284 | knn_results = self._knn(X_std_train[knn_feat], y_train) 285 | nb_results = self._naive_bayes(X_train[nb_feat], y_train) 286 | gnb_results = self._gaussian_naive_bayes(X_std_train[gnb_feat], 287 | y_train) 288 | svc_results = self._svc(X_std_train[svc_feat], y_train) 289 | svm_results = self._svm(X_std_train[svm_feat], y_train) 290 | lr_results = self._logistic_regression(X_std_train[lr_feat], y_train) 291 | 292 | data = {'rf': rf_results[0], 'ab': ab_results[0], 293 | 'gb': gb_results[0], 'knn': knn_results[0], 294 | 'nb': nb_results[0], 'gnb': gnb_results[0], 295 | 'svc': svc_results[0], 'svm': svm_results[0], 296 | 'lr': lr_results[0]} 297 | 298 | probabilities = {'rf': rf_results[1], 'ab': ab_results[1], 299 | 'gb': gb_results[1], 'knn': knn_results[1], 300 | 'nb': nb_results[1], 'gnb': gnb_results[1], 301 | 'svc': svc_results[1], 'svm': svm_results[1], 302 | 'lr': lr_results[1]} 303 | 304 | for key, value in probabilities.items(): 305 | probabilities[key] = [item[1] for item in probabilities[key]] 306 | 307 | return data, probabilities 308 | 309 | def _first_stage_predict(self, X, X_std): 310 | '''Calculate predictions for first stage of 9 models 311 | ''' 312 | rf_feat = self.top_feats[:self.rf_feats] 313 | ab_feat = self.top_feats[:self.ab_feats] 314 | gb_feat = self.top_feats[:self.gb_feats] 315 | knn_feat = self.top_feats[:self.knn_feats] 316 | nb_feat = self.top_feats[:self.nb_feats] 317 | gnb_feat = self.top_feats[:self.gnb_feats] 318 | svc_feat = self.top_feats[:self.svc_feats] 319 | svm_feat = self.top_feats[:self.svm_feats] 320 | lr_feat = self.top_feats[:self.lr_feats] 321 | 322 | X_knn = self.knn_pca.transform(X_std[knn_feat]) 323 | X_gnb = self.gnb_pca.transform(X_std[gnb_feat]) 324 | 325 | rf_results = (self.rf.predict(X[rf_feat]), 326 | self.rf.predict_proba(X[rf_feat])) 327 | ab_results = (self.ab.predict(X_std[ab_feat]), 328 | self.ab.predict_proba(X_std[ab_feat])) 329 | gb_results = (self.gb.predict(X_std[gb_feat]), 330 | self.gb.predict_proba(X_std[gb_feat])) 331 | knn_results = (self.knn.predict(X_knn), 332 | self.knn.predict_proba(X_knn)) 333 | nb_results = (self.nb.predict(X[nb_feat]), 334 | self.nb.predict_proba(X[nb_feat])) 335 | gnb_results = (self.gnb.predict(X_gnb), 336 | self.gnb.predict_proba(X_gnb)) 337 | svc_results = (self.svc.predict(X_std[svc_feat]), 338 | self.svc.predict_proba(X_std[svc_feat])) 339 | svm_results = (self.svm.predict(X_std[svm_feat]), 340 | self.svm.predict_proba(X_std[svm_feat])) 341 | lr_results = (self.lr.predict(X_std[lr_feat]), 342 | self.lr.predict_proba(X_std[lr_feat])) 343 | 344 | 
data = {'rf': rf_results[0], 'ab': ab_results[0], 345 | 'gb': gb_results[0], 'knn': knn_results[0], 346 | 'nb': nb_results[0], 'gnb': gnb_results[0], 347 | 'svc': svc_results[0], 'svm': svm_results[0], 348 | 'lr': lr_results[0]} 349 | 350 | probabilities = {'rf': rf_results[1], 'ab': ab_results[1], 351 | 'gb': gb_results[1], 'knn': knn_results[1], 352 | 'nb': nb_results[1], 'gnb': gnb_results[1], 353 | 'svc': svc_results[1], 'svm': svm_results[1], 354 | 'lr': lr_results[1]} 355 | 356 | for key, value in probabilities.items(): 357 | probabilities[key] = [item[1] for item in probabilities[key]] 358 | 359 | for key, value in probabilities.items(): 360 | print(key, value) 361 | 362 | for key, value in data.items(): 363 | print(key, value) 364 | 365 | return data, probabilities 366 | 367 | def _random_forest(self, X_train, y_train): 368 | print('Running Random Forest') 369 | rf = RandomForestClassifier(max_depth=20, 370 | max_features='sqrt', 371 | max_leaf_nodes=None, 372 | min_samples_leaf=2, 373 | min_samples_split=2, 374 | n_estimators=1000, 375 | n_jobs=-1).fit(X_train, y_train) 376 | save_pickle(rf, 'ensemble/rf.pkl') 377 | # rf = load_pickle('twitterbot_pickles/rf.pkl') 378 | predicted = rf.predict(X_train) 379 | proba = rf.predict_proba(X_train) 380 | self.rf = rf 381 | return predicted, proba 382 | 383 | def _adaboost(self, X_train, y_train): 384 | print('Running AdaBoost') 385 | ab = AdaBoostClassifier(learning_rate=1.25, 386 | n_estimators=40).fit(X_train, y_train) 387 | save_pickle(ab, 'ensemble/ab.pkl') 388 | # ab = load_pickle('twitterbot_pickles/ab.pkl') 389 | predicted = ab.predict(X_train) 390 | proba = ab.predict_proba(X_train) 391 | self.ab = ab 392 | return predicted, proba 393 | 394 | def _gradient_boosting(self, X_train, y_train): 395 | print('Running Gradient Boosting') 396 | gb = GradientBoostingClassifier(n_estimators=200, 397 | learning_rate=.1, 398 | max_depth=6, 399 | min_samples_split=2, 400 | min_samples_leaf=1, 401 | subsample=1, 402 | max_features=None 403 | ).fit(X_train, y_train) 404 | save_pickle(gb, 'ensemble/gb.pkl') 405 | # gb = load_pickle('twitterbot_pickles/gb.pkl') 406 | predicted = gb.predict(X_train) 407 | proba = gb.predict_proba(X_train) 408 | self.gb = gb 409 | return predicted, proba 410 | 411 | def _knn(self, X_train, y_train): 412 | print('Running K Nearest Neighbors') 413 | X_train = self.knn_pca.transform(X_train) 414 | knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train) 415 | save_pickle(knn, 'ensemble/knn.pkl') 416 | # knn = load_pickle('twitterbot_pickles/knn.pkl') 417 | predicted = knn.predict(X_train) 418 | proba = knn.predict_proba(X_train) 419 | self.knn = knn 420 | return predicted, proba 421 | 422 | def _knn_pca_calc(self, X_train): 423 | # Perform Principle Component Analysis 424 | print('Performing PCA on K Nearest Neighbors') 425 | pca = PCA(n_components=12) 426 | pca.fit(X_train) 427 | self.knn_pca = pca 428 | 429 | def _naive_bayes(self, X_train, y_train): 430 | print('Running Multinomial Naive Bayes') 431 | nb = MultinomialNB(alpha=10).fit(X_train, y_train) 432 | save_pickle(nb, 'ensemble/nb.pkl') 433 | # nb = load_pickle('twitterbot_pickles/nb.pkl') 434 | predicted = nb.predict(X_train) 435 | proba = nb.predict_proba(X_train) 436 | self.nb = nb 437 | return predicted, proba 438 | 439 | def _gaussian_naive_bayes(self, X_train, y_train): 440 | print('Running Gaussian Naive Bayes') 441 | X_train = self.gnb_pca.transform(X_train) 442 | gnb = GaussianNB().fit(X_train, y_train) 443 | save_pickle(gnb, 'ensemble/gnb.pkl') 
444 | # gnb = load_pickle('twitterbot_pickles/gnb.pkl') 445 | predicted = gnb.predict(X_train) 446 | proba = gnb.predict_proba(X_train) 447 | self.gnb = gnb 448 | return predicted, proba 449 | 450 | def _gnb_pca_calc(self, X_train): 451 | # Perform Principle Component Analysis 452 | print('Performing PCA on Gaussian Naive Bayes') 453 | pca = PCA(n_components=10) 454 | pca.fit(X_train) 455 | self.gnb_pca = pca 456 | 457 | def _svc(self, X_train, y_train): 458 | print('Running Support Vector Classifier') 459 | svc = SVC(C=100, 460 | coef0=1, 461 | degree=2, 462 | gamma='auto', 463 | kernel='poly', 464 | shrinking=False, 465 | probability=True).fit(X_train, y_train) 466 | save_pickle(svc, 'ensemble/svc.pkl') 467 | # svc = load_pickle('twitterbot_pickles/svc.pkl') 468 | predicted = svc.predict(X_train) 469 | proba = svc.predict_proba(X_train) 470 | self.svc = svc 471 | return predicted, proba 472 | 473 | def _svm(self, X_train, y_train): 474 | print('Running Support Vector Machine') 475 | svm = SGDClassifier(loss='modified_huber', penalty='l2', 476 | alpha=0.0001, max_iter=10).fit(X_train, y_train) 477 | save_pickle(svm, 'ensemble/svm.pkl') 478 | # svm = load_pickle('twitterbot_pickles/svm.pkl') 479 | predicted = svm.predict(X_train) 480 | proba = svm.predict_proba(X_train) 481 | self.svm = svm 482 | return predicted, proba 483 | 484 | def _logistic_regression(self, X_train, y_train): 485 | print('Running Logistic Regression') 486 | lr = LogisticRegression(C=.05).fit(X_train, y_train) 487 | save_pickle(lr, 'ensemble/lr.pkl') 488 | # lr = load_pickle('twitterbot_pickles/lr.pkl') 489 | predicted = lr.predict(X_train) 490 | proba = lr.predict_proba(X_train) 491 | self.lr = lr 492 | return predicted, proba 493 | 494 | def _ridge(self, X_train, y_train): 495 | print('Running Ridge Regression') 496 | ridge = LogisticRegression(penalty='l2', C=10000000) 497 | save_pickle(ridge, 'ensemble/ridge.pkl') 498 | # ridge = load_pickle('twitterbot_pickles/ridge.pkl') 499 | ridge.fit(X_train, y_train) 500 | self.ridge = ridge 501 | return ridge 502 | 503 | def _tfidf_fit_transform(self, X): 504 | '''Fits and concatenates tf-idf columns to X for text, pos, and ner 505 | ''' 506 | print('Calculating TF-IDF') 507 | # Perform TF-IDF on text column 508 | print(' on text column') 509 | self.tfidf_text = TfidfVectorizer(ngram_range=(1, 2), 510 | lowercase=False, 511 | token_pattern='\w+|\@\w+', 512 | norm='l2', 513 | max_df=.99, 514 | min_df=.01) 515 | tfidf_text = self.tfidf_text.fit_transform(X['text']) 516 | self.text_cols = self.tfidf_text.get_feature_names() 517 | idx = X.index 518 | tfidf_text = pd.DataFrame(tfidf_text.todense(), 519 | columns=[self.text_cols], 520 | index=idx) 521 | save_pickle(self.tfidf_text, 'ensemble/tfidf_text.pkl') 522 | # self.tfidf_text = load_pickle('twitterbot_pickles/tfidf_text.pkl') 523 | 524 | # Perform TF-IDF on ner column 525 | print(' on ner column') 526 | self.tfidf_ner = TfidfVectorizer(ngram_range=(1, 2), 527 | lowercase=False, 528 | norm='l2', 529 | max_df=.99, 530 | min_df=.01) 531 | tfidf_ner = self.tfidf_ner.fit_transform(X['ner']) 532 | self.ner_cols = self.tfidf_ner.get_feature_names() 533 | tfidf_ner = pd.DataFrame(tfidf_ner.todense(), 534 | columns=[self.ner_cols], 535 | index=idx) 536 | save_pickle(self.tfidf_ner, 'ensemble/tfidf_ner.pkl') 537 | # self.tfidf_ner = load_pickle('twitterbot_pickles/tfidf_ner.pkl') 538 | 539 | # Perform TF-IDF on pos column 540 | print(' on pos column') 541 | self.tfidf_pos = TfidfVectorizer(ngram_range=(2, 3), 542 | lowercase=False, 
543 | norm='l2', 544 | max_df=.99, 545 | min_df=.01) 546 | tfidf_pos = self.tfidf_pos.fit_transform(X['pos']) 547 | self.pos_cols = self.tfidf_pos.get_feature_names() 548 | tfidf_pos = pd.DataFrame(tfidf_pos.todense(), 549 | columns=[self.pos_cols], 550 | index=idx) 551 | save_pickle(self.tfidf_pos, 'ensemble/tfidf_pos.pkl') 552 | # self.tfidf_pos = load_pickle('twitterbot_pickles/tfidf_pos.pkl') 553 | 554 | X = self._tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner) 555 | 556 | return X 557 | 558 | def _tfidf_transform(self, X): 559 | '''Performs a tf-idf transform on the given column of data 560 | ''' 561 | X.reset_index(drop=True, inplace=True) 562 | tfidf_text = self.tfidf_text.transform(X['text']) 563 | tfidf_text = pd.DataFrame(tfidf_text.todense(), 564 | columns=[self.text_cols]) 565 | 566 | tfidf_ner = self.tfidf_ner.transform(X['ner']) 567 | tfidf_ner = pd.DataFrame(tfidf_ner.todense(), 568 | columns=[self.ner_cols]) 569 | 570 | tfidf_pos = self.tfidf_pos.transform(X['pos']) 571 | tfidf_pos = pd.DataFrame(tfidf_pos.todense(), 572 | columns=[self.pos_cols]) 573 | 574 | X = self._tfidf_remove_dups(X, tfidf_text, tfidf_pos, tfidf_ner) 575 | 576 | return X 577 | 578 | def _tfidf_remove_dups(self, X, tfidf_text, tfidf_pos, tfidf_ner): 579 | '''Removes columns in tfidf_pos and tfidf_ner that are duplicates from 580 | tfidf_text, and concatentates the DataFrames 581 | ''' 582 | # Drop ner columns also present in tfidf_text 583 | columns_to_keep = [x for x in tfidf_ner 584 | if x not in tfidf_text] 585 | tfidf_ner = tfidf_ner[columns_to_keep] 586 | 587 | # Drop pos columns also present in ner 588 | columns_to_keep = [x for x in tfidf_pos 589 | if x not in tfidf_ner] 590 | tfidf_pos = tfidf_pos[columns_to_keep] 591 | 592 | X = pd.concat([X, tfidf_text, tfidf_pos, tfidf_ner], axis=1) 593 | return X 594 | 595 | 596 | def save_pickle(item, filename): 597 | # Save pickle file 598 | output = open(filename, 'wb') 599 | print(' Pickle dump', filename) 600 | pickle.dump(item, output, protocol=4) 601 | output.close() 602 | 603 | 604 | def load_pickle(filename): 605 | # Open pickle filename 606 | print(' Pickle load', filename) 607 | with open(filename, 'rb') as f: 608 | return pickle.load(f) 609 | 610 | 611 | if __name__ == '__main__': 612 | main() 613 | --------------------------------------------------------------------------------