├── .env ├── .gitignore ├── Docker └── docker-compose.yml ├── README.md ├── requirements.txt └── src ├── Database ├── data_to_csv.py ├── database.py └── output │ └── litecoin_user_list.csv ├── LearningComponent ├── model.py └── preprocessing.py ├── Main └── main.py ├── Obj ├── tweet.py └── tweetlist.py ├── Scraper └── tweets_scraper.py ├── UIWidget └── widget.py └── Visualization └── plot.py /.env: -------------------------------------------------------------------------------- 1 | # This dotenv file is used to configure the base implementation of the TSAT 2 | # You MUST create your own twitter app on twitters developer website to obtain the api keys 3 | # When Running the Tool, go to localhost:5555 and use email: user and password: password 4 | # To view postgres database 5 | 6 | # API KEYS 7 | API='{Insert Twitter API Here}' 8 | API_SECRET='{Insert Twitter API Secret Here}' 9 | ACCESS_TOKEN='{Insert Twitter Access Token Here}' 10 | ACCESS_KEY='{Insert Twitter Access Key Here' 11 | 12 | # DB CONFIG (See README for advance implementation with docker) 13 | DB_NAME='user' 14 | DB_PASSWORD='password' 15 | DB_ADDRESS='127.0.0.1' 16 | DB_PORT='5432' 17 | 18 | # SENTIMENT CONFIG 19 | TABLE_NAME='{Insert Table Name Here}' 20 | KEYWORD='{Insert Keyword to Perform Tweet Analysis on Here}' 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | dotenv 3 | -------------------------------------------------------------------------------- /Docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # docker-compose up 2 | # go to local host 5555 3 | # login with user and password 4 | # create a db on the network: 172.--.0.1 5 | 6 | version: '3' 7 | services: 8 | postgres_db: 9 | image: "postgres" 10 | ports: 11 | - "5432:5432" 12 | environment: 13 | POSTGRES_USER: 'user' 14 | 
POSTGRES_PASSWORD: 'password' 15 | 16 | postgres_manager: 17 | image: "dpage/pgadmin4" 18 | ports: 19 | - "5555:80" 20 | environment: 21 | PGADMIN_DEFAULT_EMAIL: 'user' 22 | PGADMIN_DEFAULT_PASSWORD: 'password' 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TwitterSentimentAnalysisTool (TSAT) 2 | A highly adaptable tool for analyzing twitter data. 3 | Auto scrapes Tweets based on user keyword from the previous week and applies a naive bayes classifier to analyze sentiment. Comes prebuilt with a docker postgres server managed by pgadmin 4 | Includes graphical visuals to analyze and categorize data. 5 | 6 | **Basic Usage:** 7 | 1. Clone the project 8 | 2. Install docker and docker-compose (not listed in requirements.txt) 9 | 3. Install requirements.txt 10 | 4. In terminal navigate to the project and cd to Docker 11 | 5. sudo docker-compose up 12 | 6. Check localhost:5555 and sign in to postgres using 'user' and 'password' as email and password respectively 13 | 7. create a server with the login 'user' and 'password' and the network 172.XX.0.1 - Look at the terminal for the ip address to fill in the XX 14 | 8. Go to .env file and set environment variables (Assumes you created a twitter app) 15 | 9. 
Run main.py, classify tweets and view projections 16 | 17 | **Important References:** 18 | * https://arxiv.org/pdf/1811.07522.pdf 19 | * https://jakevdp.github.io/PythonDataScienceHandbook/05.05-naive-bayes.html 20 | * https://towardsdatascience.com/creating-the-twitter-sentiment-analysis-program-in-python-with-naive-bayes-classification-672e5589a7ed 21 | * https://www.sciencedirect.com/science/article/pii/S1877050919302789 22 | * https://www.sciencedirect.com/science/article/pii/S2405844018332067 23 | * https://arxiv.org/pdf/1509.02971.pdf 24 | 25 | **Advanced Usage:** 26 | The tool consists of a series of classes for managing and classifying twitterdata documented here: 27 | 28 | **Tweet** 29 | * *Constructor:* * Initializes all the tweet properties to none. Properties include: {Text, User, Retweet_Count, Date, Favorite Count, Follower Count, Nlp Score, Given Score, Tokenized Text 30 | * *insert_custom_value(name, value):* * Allows user to add a custom tweet property 31 | * *add_{insert_property}(property):* * Allows users to update defined properties 32 | * *add_tweet_json(tweet)* * When referencing the twitter API, tweets are returned in Json. This parses the json into a tweet obj 33 | * *add_tweet(text,user, favorite_count, retweet_count, follower_count, date, nlp_score, given_score, tokenized_text):* * Allows the user to add multiple properties at once. None of the properties are required 34 | 35 | **TweetList** 36 | * *Constructor:* * Initializes the dict for storing counds a begins indexing 37 | * *insert_data(tweet)* * Inserts a tweet 38 | * *remove_{insert_property}(property):* * Removes any tweets in the list matching the proeprty 39 | * *generate_random_tweet_list(size):* * Returns a random subset of current tweet list (used to make test and training sets) 40 | 41 | **TweetScraper** 42 | * *Constructor(consumer_key, consumer_secret, access_token, access_key):* * Initializes conncetion with twitter api. 
Access token and key not required 43 | * *search(keyword, user, start_date, end_date):* * Searches twitter and returns a tweet list. Keyword is the only reqauired value. Combinations of {keyword, user},{keyword, user, start_date, end_date}, and {keyword,end_date} are supported 44 | * *get_weekly_tweets(keyword):* * Gets tweets containing the keyword from the past week 45 | * *list_members(user, slug):* * Gets members of a list 46 | * *get_valuable_users(base_user):* * Gets all members of all base_users lists 47 | * *users_lists(user):* * Gets user lists 48 | 49 | **Data_To_Csv (Not used in Default Main)** 50 | * *Constructor:* * Does nothing 51 | * *write_data_to_csv(tweet_list, file_name):* * Writes tweets to a csv file at the file_name path. File_name defaults to src/Database/output/tweetlist.csv 52 | * *read_data_from_csv(file_name):* * Reads tweets from a csv file and parses into a TweetList obj 53 | * *write_userlist_to_csv(userlist, file_name):* * Writes a list of users to a csv file at the file_name path. File_name defaults to src/Database/output/userlist.csv 54 | * *read_userlist_from_csv(file_name):* * Reads a column of users from csv. Returns python list 55 | 56 | **Database** 57 | * *Constructor(user, password, host, port):* * Attempts to initalize db 58 | * *create_table(name, column_name, column_type):* * Creates a table in the db with all column names in the vector column_name and their corresponding column_type. The number of column names and types must be equivalent. 
* *num_rows(table_name, column_name):* * Returns the number of rows in a column
* *update_column_by_id(table_name, column_name, tweet_id, new_value):* * Updates the specified column name with a new value any time the id is the tweet_id
* *update_column_by_text(table_name, column_name, text, new_value):* * Updates the specified column name with a new value any time the text is equal to the text param
* *insert_tweet(table_name, tweet_id, tweet):* * Inserts a specific tweet
* *insert_tweet_list(table_name, tweet_list):* * Inserts all tweets in order of their storage in tweet_list
* *parse_db_into_tweet_list(name):* * Parses all tweets in the named table into tweet objs and then stores them in a TweetList obj

**PreProcessing**
* *Constructor:* * Builds stop words
* *process_tweets(list_tweets):* * Tokenizes a list of tweets and returns their associated tokenized array and label
* *generate_token_array(token_arr):* * Transforms the multi-array tokenized list of tweets into a single-dim array for inserting into the db

**UserInterface**
* *Constructor(data, db, db_name):* * Initializes widget values. Set data equal to the training set to classify tweets
* *Classify:* * Creates a UI for easily self-classifying tweets for the training set. Automatically updates the DB

**Plot**
* *constructor(db):* * Sets initial values
* *generate_projections(db_name, column_name):* * Gathers all manually classified tweet scores and assigns values
* *build_projections_histogram:* * Displays a histogram of manually classified results
* *create_classification_plot(pos_score, neg_score, neutral_score, irr_score):* * Displays a histogram of given params.
Used to display results from NaiveBayes Classification Tool 80 | 81 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.7.1 2 | astor==0.8.0 3 | backports.weakref==1.0.post1 4 | certifi==2023.7.22 5 | chardet==3.0.4 6 | enum34==1.1.6 7 | funcsigs==1.0.2 8 | futures==3.2.0 9 | gast==0.2.2 10 | google-pasta==0.1.7 11 | grpcio==1.53.0 12 | h5py==2.9.0 13 | idna==2.8 14 | Keras==2.2.4 15 | Keras-Applications==1.0.8 16 | Keras-Preprocessing==1.1.0 17 | Markdown==3.1.1 18 | mock==3.0.5 19 | numpy==1.22.0 20 | oauthlib==3.0.2 21 | pandas==0.24.2 22 | protobuf==3.18.3 23 | pygame==1.9.6 24 | PySocks==1.7.0 25 | python-dateutil==2.8.0 26 | python-dotenv==0.10.3 27 | pytz==2019.1 28 | PyYAML==5.4 29 | requests==2.31.0 30 | requests-oauthlib==1.2.0 31 | scipy==1.10.0 32 | six==1.12.0 33 | tensorboard==1.14.0 34 | tensorflow-estimator==1.14.0 35 | tensorflow-gpu==2.12.0 36 | termcolor==1.1.0 37 | tweepy==3.8.0 38 | typing==3.7.4 39 | urllib3==1.26.17 40 | Werkzeug==2.2.3 41 | wrapt==1.11.2 42 | xlrd==1.2.0 43 | -------------------------------------------------------------------------------- /src/Database/data_to_csv.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from src.Obj.tweet import Tweet 3 | from src.Obj.tweetlist import TweetList 4 | 5 | 6 | # Class CSV: Handles moving data to and from csv files. 
This is optional by user and is included if postgres is 7 | # not a preferred method of storage 8 | class CSV: 9 | # Write a list of tweets to a file 10 | # Takes in a data object holding a list of tweets 11 | # Default is tweetlist.csv in the output folder 12 | @staticmethod 13 | def write_data_to_csv(tweet_list, file_name=None): 14 | tweet_id, user, text, favorite_count, follower_count, retweet_count, date, nlp_score, \ 15 | given_score = [], [], [], [], [], [], [], [], [] 16 | for value in tweet_list.data: 17 | tweet_id.append(value) 18 | user.append(tweet_list.data[value].user) 19 | text.append(tweet_list.data[value].text) 20 | follower_count.append(tweet_list.data[value].follower_count) 21 | retweet_count.append(tweet_list.data[value].retweet_count) 22 | date.append(tweet_list.data[value].date) 23 | nlp_score.append(tweet_list.data[value].nlp_score) 24 | given_score.append(tweet_list.data[value].given_score) 25 | favorite_count.append(tweet_list.data[value].favorite_count) 26 | dictionary = {'id': tweet_id, 'text': text, 'user': user, 'favorite_count': favorite_count, 'retweet_count': 27 | retweet_count, 'follower_count': follower_count, 'date': date, 'nlp_score': nlp_score, 28 | 'given_score': given_score} 29 | df = pd.DataFrame(dictionary) 30 | if file_name is None: 31 | df.to_csv('src/Database/output/tweetlist.csv', index=False) 32 | else: 33 | df.to_csv(file_name, index=False) 34 | 35 | # Reads a list of tweet objects from a file 36 | # Returns a data object and takes in a file location of tweets 37 | # def read_data_from_csv(self, file_name): 38 | @staticmethod 39 | def read_data_from_csv(file_name): 40 | data = TweetList() 41 | df = pd.read_csv(file_name) 42 | for index, row in df.iterrows(): 43 | tweet = Tweet() 44 | tweet.add_tweet_noscore(row['text'], row['user'], row['favorite_count'], row['retweet_count'], 45 | row['follower_count'], row['date']) 46 | data.insert_data(tweet) 47 | return data 48 | 49 | # Writes an array of users to a csv file 50 | # 
Takes in an array and optionally a file_name to write to. 51 | # Default is userlist.csv in the output folder 52 | @staticmethod 53 | def write_userlist_to_csv(userlist, file_name=None): 54 | df = pd.DataFrame(userlist, columns=["Users"]) 55 | if file_name is None: 56 | df.to_csv('src/Database/output/userlist.csv', index=False, mode='a') 57 | else: 58 | df.to_csv(file_name, index=False, mode='a') 59 | 60 | # Reads a list of users into an array 61 | # Takes the file location of users 62 | @staticmethod 63 | def read_userlist_from_csv(file_name): 64 | # noinspection PyBroadException 65 | try: 66 | userlist = pd.read_csv(file_name) 67 | except Exception: 68 | raise Exception("File not found") 69 | return userlist.values.tolist() 70 | -------------------------------------------------------------------------------- /src/Database/database.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from src.Obj.tweet import Tweet 3 | from src.Obj.tweetlist import TweetList 4 | 5 | 6 | # Class Database: Adds and removes objects from postgres 7 | class Database: 8 | # constructor to initialize connection 9 | def __init__(self, user, password, host, port): 10 | self.connection = psycopg2.connect(user=user, password=password, host=host, port=port) 11 | self.cursor = self.connection.cursor() 12 | self.cursor.execute("SELECT version();") 13 | record = self.cursor.fetchone() 14 | print("You are connected to - ", record) 15 | 16 | # destructor to commit changes to database 17 | def __del__(self): 18 | self.connection.commit() 19 | self.connection.close() 20 | self.cursor.close() 21 | 22 | # Creates a table with name `name` consisting of a set of column names and 23 | # types. The number of column names and types must be equivalent. 
24 | def create_table(self, name, column_name, column_type): 25 | if len(column_name) != len(column_type): 26 | raise ValueError("column names must match size of column types") 27 | table_command = "CREATE TABLE " + name + " (" 28 | for i in range(len(column_name)): 29 | table_command += column_name[i] + " " + column_type[i] + ", " 30 | table_command = table_command[:-2] 31 | table_command += ");" 32 | self.cursor.execute(table_command) 33 | 34 | # Returns number of rows in a specific column 35 | def num_rows(self, table_name, column_name): 36 | table_command = "SELECT COUNT(" + column_name + ") FROM " + table_name 37 | return self.cursor.execute(table_command) 38 | 39 | # updates a column based on id 40 | def update_column_by_id(self, table_name, column_name, tweet_id, new_value): 41 | table_command = "UPDATE " + table_name + " SET " + column_name + " = " + str(new_value) + \ 42 | " WHERE id = " + str(tweet_id) 43 | self.cursor.execute(table_command) 44 | 45 | def update_column_by_text(self, table_name, column_name, text, new_value): 46 | table_command = "UPDATE " + table_name + " SET " + column_name + " = '" + str(new_value) + \ 47 | "' WHERE text = " + "'{0}'".format(text) 48 | self.cursor.execute(table_command) 49 | 50 | # creates a new column and adds data in form of a data object to it into it 51 | def create_column(self, table_name, column_name, data, type_data): 52 | table_command = "ALTER TABLE " + table_name + " ADD COLUMN " + column_name + " " + type_data 53 | self.cursor.execute(table_command) 54 | self.insert_list(table_name, column_name, data) 55 | 56 | # deletes a row 57 | def delete_row(self, table_name, tweet_id): 58 | table_command = "DELETE FROM " + table_name + " WHERE id =" + str(tweet_id) 59 | self.cursor.execute(table_command) 60 | 61 | # gets column data and returns as list 62 | def get_column_data(self, table_name, column_name): 63 | table_command = "SELECT " + column_name + " FROM " + table_name 64 | self.cursor.execute(table_command) 65 | 
return self.cursor.fetchall() 66 | 67 | # gets row data and returns it as a tweet 68 | def get_row_data(self, table_name, tweet_id): 69 | table_command = "SELECT * FROM " + table_name + " WHERE id = " + str(tweet_id) 70 | self.cursor.execute(table_command) 71 | return self.cursor.fetchall() 72 | 73 | # inserts a tweet object into a table 74 | def insert_tweet(self, table_name, tweet_id, tweet): 75 | table_command = "INSERT into {0}" \ 76 | " VALUES ({1}, '{2}', '{3}', {4}, {5}, {6}, '{7}', '{8}', '{9}')".format(table_name, 77 | str(tweet_id), 78 | self.check_none( 79 | tweet.text).replace( 80 | "'", ""), 81 | self.check_none( 82 | tweet.user), 83 | self.check_none( 84 | tweet.retweet_count), 85 | self.check_none( 86 | tweet.favorite_count), 87 | self.check_none( 88 | tweet.follower_count), 89 | self.check_none( 90 | tweet.date), 91 | self.check_none( 92 | tweet.nlp_score), 93 | self.check_none( 94 | tweet.given_score)) 95 | self.cursor.execute(table_command) 96 | 97 | # inserts a list of tweet objects 98 | def insert_tweet_list(self, table_name, tweet_list): 99 | for value in tweet_list.data: 100 | self.insert_tweet(table_name, value, tweet_list.data[value]) 101 | 102 | # inserts a general list 103 | def insert_list(self, table_name, column_name, list): 104 | for value in list: 105 | # noinspection PyBroadException 106 | try: 107 | value = value.replace("'", "") 108 | except Exception: 109 | pass 110 | table_command = "INSERT into {0} ({1}) VALUES ('{2}')".format(table_name, column_name, value) 111 | self.cursor.execute(table_command) 112 | 113 | # helper method for insert_data 114 | @staticmethod 115 | def check_none(value): 116 | if value is None: 117 | return "-10" 118 | else: 119 | return value 120 | 121 | # commits to db (used for testing, optimally destructor will commit) 122 | def commit(self): 123 | self.connection.commit() 124 | 125 | # gets the number of columns of a table 126 | def get_num_of_columns(self, name): 127 | table_command = "SELECT COUNT(*) 
FROM " + name 128 | self.cursor.execute(table_command) 129 | return self.cursor.fetchone()[0] 130 | 131 | # parses the table of tweets back into a tweet_list obj usable in plots 132 | def parse_db_into_tweet_list(self, name): 133 | num_cols = self.get_num_of_columns(name) 134 | tweet_list = TweetList() 135 | for tweet_id in range(1, num_cols + 1): 136 | tweet = Tweet() 137 | unparsed_data = self.get_row_data(name, tweet_id) 138 | try: 139 | unparsed_data = unparsed_data[0] 140 | except IndexError: 141 | continue 142 | 143 | tweet.add_tweet(unparsed_data[1], unparsed_data[2], unparsed_data[3], unparsed_data[4], unparsed_data[5], 144 | unparsed_data[6], unparsed_data[7], unparsed_data[8], unparsed_data[9]) 145 | tweet_list.insert_data(tweet) 146 | return tweet_list 147 | -------------------------------------------------------------------------------- /src/LearningComponent/model.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | # Class Model: A model for classifying tweets using Naive Bayes 4 | class Model: 5 | # set inital classifier to none 6 | def __init__(self): 7 | self.word_features = None 8 | self.classifier = None 9 | 10 | # build a vocabulary of all tweets in training set 11 | def build_vocabulary(self, processed_multi_arr): 12 | vocab = [] 13 | 14 | for (words, sentiment) in processed_multi_arr: 15 | vocab.extend(words) 16 | 17 | word_list = nltk.FreqDist(vocab) 18 | self.word_features = word_list.keys() 19 | 20 | # create a count of each word 21 | def extract_features(self, tweet_text): 22 | features = {} 23 | for word in self.word_features: 24 | features['contains(%s)' % word] = (word in tweet_text) 25 | return features 26 | 27 | # build feature vector with relative labeling 28 | def build_feature_vector(self, tokenized_data): 29 | feature_vector = nltk.classify.apply_features(self.extract_features, tokenized_data) 30 | return feature_vector 31 | 32 | # trains the classifier 33 | def 
train_classifier(self, feature_vector): 34 | self.classifier = nltk.NaiveBayesClassifier.train(feature_vector) 35 | return self.classifier 36 | 37 | # tests the classifier on a tweet (helper method for classify_test_set) 38 | def test_classifier(self, tweet): 39 | test = self.classifier.classify(self.extract_features(tweet)) 40 | print(test) 41 | return test 42 | 43 | # classifies an entire test set 44 | def classify_test_set(self, test_set): 45 | return_labels = [self.test_classifier(test_set.data[index].tokenized_text) for index in test_set.data] 46 | return return_labels 47 | 48 | -------------------------------------------------------------------------------- /src/LearningComponent/preprocessing.py: -------------------------------------------------------------------------------- 1 | import re 2 | from nltk.tokenize import word_tokenize 3 | from string import punctuation 4 | from nltk.corpus import stopwords 5 | 6 | 7 | # Tokenization of Data before ML Classifier 8 | class PreProcessing: 9 | # create stop words set 10 | def __init__(self): 11 | self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL']) 12 | 13 | # process each tweet in a tweet_list 14 | def process_tweets(self, list_tweets): 15 | processed_tweets = [] 16 | for index in list_tweets.data: 17 | processed_tweets.append( 18 | (self._process_tweet(list_tweets.data[index].text), list_tweets.data[index].given_score)) 19 | return processed_tweets 20 | 21 | # remove junk and tokenize 22 | def _process_tweet(self, tweet): 23 | tweet = tweet.lower() 24 | tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs 25 | tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames 26 | tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag 27 | tweet = word_tokenize(tweet) # remove repeated characters (helloooooooo into hello) 28 | return [word for word in tweet if word not in self._stopwords] 29 | 30 | @staticmethod 31 | def 
generate_token_array(token_arr): 32 | for index, value in enumerate(token_arr): 33 | string = "" 34 | for word in value[0]: 35 | string += word + "," 36 | token_arr[index] = string[:-1] 37 | return token_arr 38 | -------------------------------------------------------------------------------- /src/Main/main.py: -------------------------------------------------------------------------------- 1 | from src.Scraper.tweets_scraper import TweetScraper 2 | from src.Database.database import Database 3 | from src.UIWidget.widget import UserInterface 4 | from src.Obj.tweetlist import TweetList 5 | from src.LearningComponent.model import Model 6 | from src.LearningComponent.preprocessing import PreProcessing 7 | import os 8 | from src.Visualization.plot import Plot 9 | from dotenv import load_dotenv 10 | 11 | 12 | def main(): 13 | # Load Environment 14 | load_dotenv() 15 | 16 | # Connect To Twitter API 17 | print("Connecting to Twitter API...") 18 | try: 19 | api = TweetScraper(os.environ.get('API'), os.environ.get('API_SECRET'), os.environ.get('ACCESS_TOKEN'), 20 | os.environ.get('ACCESS_KEY')) 21 | except Exception: 22 | raise Exception("Unable to Connect to API") 23 | print("Connected to API") 24 | 25 | # Gather Tweets From Past Week 26 | print("Gathering Tweets....") 27 | try: 28 | tweets = api.get_weekly_tweets(os.environ.get("KEYWORD")) 29 | except Exception: 30 | raise Exception("Unable to gather tweets") 31 | 32 | # Connect to DB 33 | print("Establishing DB Connection") 34 | db = Database(os.environ.get('DB_NAME'), os.environ.get('DB_PASSWORD'), os.environ.get('DB_ADDRESS'), 35 | os.environ.get('DB_PORT')) 36 | 37 | # Create Table 38 | db_column_name = ['Id', 'Text', 'Username', 'Favorite_Count', 'Retweet_Count', 'Follower_Count', 'Date', 39 | 'Nlp_Score', 40 | 'Given_Score', 'tokenized_text'] 41 | db_column_type = ['INTEGER', 'VARCHAR', 'VARCHAR', 'INTEGER', 'INTEGER', 'INTEGER', 'VARCHAR', 'VARCHAR', 'INTEGER', 42 | 'VARCHAR'] 43 | table_name = 
os.environ.get('TABLE_NAME') 44 | print("Creating table " + str(table_name)) 45 | db.create_table(table_name, db_column_name, db_column_type) 46 | 47 | # Insert Tweets 48 | print("Inserting Tweets") 49 | try: 50 | db.insert_tweet_list(table_name, tweets) 51 | except Exception: 52 | raise Exception("Unable to insert tweets into Postgres DB") 53 | 54 | # Parse DB Into Tweets and Build Training Set 55 | tweet_list = db.parse_db_into_tweet_list(table_name) 56 | training_set_size = int(tweet_list.get_size()*.95) 57 | training_set = tweet_list.generate_random_tweet_list(training_set_size) 58 | 59 | # Create Tweet Classifier UI and Classify Training Set 60 | ui_widget = UserInterface(training_set, db, table_name) 61 | ui_widget.classify() 62 | 63 | # Visualize Classification Projections 64 | plot = Plot(db) 65 | plot.generate_projections(table_name, 'given_score') 66 | plot.build_projections_histogram(table_name) 67 | 68 | # PreProcess Data and Add Tokenized Text To DB: 69 | try: 70 | process = PreProcessing() 71 | token_data = process.generate_token_array(process.process_tweets(tweet_list)) 72 | for index, value in enumerate(token_data): 73 | db.update_column_by_text(table_name, 'tokenized_text', tweet_list.data[index+1].text, token_data[index]) 74 | except Exception: 75 | raise Exception("Unable to Tokenize Data") 76 | 77 | # ReParse DB to Get Test Set 78 | tweet_list = db.parse_db_into_tweet_list(table_name) 79 | training_set = TweetList() 80 | test_set = TweetList() 81 | for index in tweet_list.data: 82 | if tweet_list.data[index].given_score != -10: 83 | training_set.insert_data(tweet_list.data[index]) 84 | else: 85 | test_set.insert_data(test_set.insert_data(tweet_list.data[index])) 86 | 87 | # Build Vocabulary 88 | labeled_training_set = process.process_tweets(training_set) 89 | model = Model() 90 | model.build_vocabulary(labeled_training_set) 91 | 92 | # Build Feature Vector and Classify 93 | feature_vector = model.build_feature_vector(labeled_training_set) 94 | 
model.train_classifier(feature_vector) 95 | 96 | # Test on test set 97 | results = model.classify_test_set(test_set) 98 | plot.create_classification_plot(results.count(1), results.count(-1), results.count(0), results.count(100)) 99 | 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /src/Obj/tweet.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Tweet Class: This class creates fast variable access for holding various tweet 4 | # attributes as well as parsing tweets. 5 | class Tweet: 6 | # Default Constructor 7 | def __init__(self): 8 | self.text = None 9 | self.user = None 10 | self.retweet_count = None 11 | self.date = None 12 | self.favorite_count = None 13 | self.follower_count = None 14 | self.nlp_score = None 15 | self.given_score = None 16 | self.tokenized_text = None 17 | 18 | # Prints All Tweet Attributes 19 | def __str__(self): 20 | attributes = " " 21 | for attribute in self.__dict__: 22 | attributes += attribute + ": " + str(self.__dict__[attribute]) + " " 23 | return attributes[1:] 24 | 25 | # Insert Custom Attributes to class dict. 
Properties act as default properties 26 | def insert_custom_value(self, name, value): 27 | dictionary = {name: value} 28 | self.__dict__.update(dictionary) 29 | 30 | # Remove Custom Attribute 31 | def remove_custom_value(self, name): 32 | del self.__dict__[name] 33 | 34 | def add_nlp_score(self, score): 35 | self.nlp_score = score 36 | 37 | def add_given_score(self, score): 38 | self.given_score = score 39 | 40 | def add_user(self, user): 41 | self.user = user 42 | 43 | def add_text(self, text): 44 | self.text = text 45 | 46 | def add_creation_date(self, date): 47 | self.date = date 48 | 49 | def add_favorite_count(self, count): 50 | self.favorite_count = count 51 | 52 | def add_retweet_count(self, count): 53 | self.retweet_count = count 54 | 55 | def add_follower_count(self, count): 56 | self.follower_count = count 57 | 58 | def add_tokenized_text(self, text): 59 | self.tokenized_text = text 60 | 61 | # Parses json Obj to add tweet attributes 62 | def add_tweet_json(self, tweet): 63 | self.add_user(tweet.user.screen_name) 64 | self.add_creation_date(tweet.created_at) 65 | self.add_favorite_count(tweet.favorite_count) 66 | self.add_retweet_count(tweet.retweet_count) 67 | self.add_text(tweet.full_text) 68 | self.add_follower_count(tweet.user.followers_count) 69 | 70 | # adds tweet from inputs with scores 71 | def add_tweet(self, text=None, user=None, favorite_count=None, reteweet_count=None, follower_count=None, date=None, 72 | nlp_score=None, given_score=None, tokenized_text=None): 73 | if given_score is not None: 74 | self.add_given_score(given_score) 75 | if nlp_score is not None: 76 | self.add_nlp_score(nlp_score) 77 | self.add_user(user) 78 | self.add_creation_date(date) 79 | self.add_favorite_count(favorite_count) 80 | self.add_retweet_count(reteweet_count) 81 | self.add_text(text) 82 | self.add_follower_count(follower_count) 83 | self.add_tokenized_text(tokenized_text) -------------------------------------------------------------------------------- 
# ============================================================
# /src/Obj/tweetlist.py
# ============================================================
import random


# TweetList Class: holds a dictionary of tweets keyed by a 1-based
# insertion index for quick access via index.
class TweetList:
    # Default Constructor
    def __init__(self):
        # data maps 1-based index -> tweet; count is always one past the
        # highest key in use (so the next insert lands at data[count]).
        self.data = {}
        self.count = 1

    # Prints every tweet in the dictionary; returns "" because the
    # printing itself is the (side-effecting) output.
    def __str__(self):
        for key in self.data:
            print("ID: " + str(key), end=" ")
            print(self.data[key])
        return ""

    # Adds a tweet entry (skipping duplicates) and returns the new size
    # of the list, for convenience.
    # Fixed: the original returned self.count, which is one MORE than the
    # size promised by this comment.
    def insert_data(self, tweet):
        if tweet not in self.data.values():
            self.data[self.count] = tweet
            self.count += 1
        return len(self.data)

    # Removes the last tweet and returns the new size of the list, for
    # convenience.
    # Fixed: the original sliced the dict (self.data[:-1]), which raises
    # TypeError; dicts are not sliceable.
    def remove_last(self):
        if self.count > 1:
            self.count -= 1
            del self.data[self.count]
        return len(self.data)

    def remove_index(self, index):
        del self.data[index]

    # Helper: drop every tweet whose `field` equals `wanted`.
    # Keys are collected first because deleting from a dict while
    # iterating it raises RuntimeError.
    # NOTE(review): assumes stored tweets support mapping-style access
    # (tweet['user'] etc.) as the original code implied -- confirm
    # against src/Obj/tweet.py.
    def _remove_matching(self, field, wanted):
        doomed = [key for key, tweet in self.data.items() if tweet[field] == wanted]
        for key in doomed:
            del self.data[key]

    # Fixed: the original tested key['user'] -- but key is the int index,
    # so this always raised TypeError.  The tweet value is tested now.
    def remove_user(self, user):
        self._remove_matching('user', user)

    def remove_creation_date(self, date):
        self._remove_matching('created_at', date)

    def remove_tweet(self, text):
        self._remove_matching('text', text)

    def get_tweet(self, index):
        return self.data[index]

    def get_size(self):
        return len(self.data)

    # Merge another TweetList into this one, skipping duplicates.
    def insert_list(self, tweet_list):
        for index in tweet_list.data:
            if tweet_list.data[index] not in self.data.values():
                self.insert_data(tweet_list.data[index])

    # Fixed: the original returned self.count, which is one more than the
    # number of stored tweets; now consistent with get_size().
    def __len__(self):
        return len(self.data)

    # Generates a random subset of the current list of the designated size.
    # Assumes keys are contiguous from 1 (i.e. no removals have happened).
    def generate_random_tweet_list(self, size):
        tweet_list = TweetList()
        for key in random.sample(range(1, self.count), size):
            tweet_list.insert_data(self.data[key])
        return tweet_list

    # Returns the most recently inserted tweet.
    def get_last(self):
        return self.data[self.count - 1]


# ============================================================
# /src/Scraper/tweets_scraper.py
# ============================================================
import tweepy
from src.Obj.tweet import Tweet
from src.Obj.tweetlist import TweetList
from datetime import datetime
from datetime import timedelta


# TweetScraper Class: wraps the twitter API via tweepy and builds Tweet objects
class TweetScraper:
    # Connect to the twitter API; the access-token pair is optional
    # (app-only auth when omitted).
    def __init__(self, consumer_key, consumer_secret, access_token=None, access_token_secret=None):
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        if access_token is not None:
            auth.set_access_token(access_token, access_token_secret)
        self.api = tweepy.API(auth, wait_on_rate_limit=True)

    # Helper: render a datetime as the YYYY-M-D text used in twitter queries.
    @staticmethod
    def _format_date(d):
        return str(d.year) + "-" + str(d.month) + "-" + str(d.day)

    # Search with keyword, user, and date objects.  Retweets are excluded.
    # Rate: 180 calls per window.
    # Returns a TweetList.
    def search(self, keyword, user=None, start_date=None, end_date=None):
        # search with just a keyword
        if user is None and start_date is None and end_date is None:
            query = keyword + " -RT"
            return self.parse_tweets(self.api.search(query, tweet_mode='extended', count=100))

        # search with just keyword and user
        if start_date is None and end_date is None:
            query = keyword + "(from:" + user + ")(to:" + user + ")" + " -RT"
            return self.parse_tweets(self.api.search(query, tweet_mode='extended'))

        # search with keyword and end date (used in get_weekly_tweets)
        if user is None and start_date is None:
            query = keyword + "()until:" + self._format_date(end_date) + " -RT"
            return self.parse_tweets(self.api.search(query, tweet_mode='extended', count=100))

        # search with user, keyword, and both dates.
        # Fixed: the original guarded this branch with `end_date is None`
        # and then dereferenced end_date.year, so it always crashed, and
        # the all-parameters case fell through returning None.
        query = (keyword + "(from:" + user + ")(to:" + user + ")since:" +
                 self._format_date(start_date) + "until:" +
                 self._format_date(end_date) + " -RT")
        return self.parse_tweets(self.api.search(query, tweet_mode='extended'))

    # Repeatedly queries backwards from today until roughly a week of
    # tweets has been collected; returns a TweetList.
    def get_weekly_tweets(self, keyword):
        today = datetime.today()
        cur_date = today
        last_week = today - timedelta(days=6, hours=20)
        tweet_list = TweetList()
        query_count = 0
        while cur_date > last_week:
            tweet_list.insert_list(self.search(keyword=keyword, end_date=cur_date))
            query_count += 1
            # step the window back to the oldest tweet fetched so far
            cur_date = tweet_list.get_last().date
        print("Over the Past Week: \nThere have been {0} tweets about {1} which were collected over {2} queries".format(
            tweet_list.get_size(), keyword, query_count))
        return tweet_list

    # Gather the members of one of `user`'s lists (helper to get_valuable_users).
    # Rate: 75.
    # Returns a python list of screen names.
    def list_members(self, user, slug):
        return [member.screen_name
                for member in tweepy.Cursor(self.api.list_members, user, slug).items()]

    # Takes a user, gathers their lists, and compiles the distinct members
    # of all of those lists.
    # Rate: 15 list calls, 75 member calls.
    # Returns a python list.
    def get_valuable_users(self, base_user):
        lists = self.users_lists(base_user)
        valuable_users = []
        seen = set()
        count = 1
        for item in lists:
            print(str(count) + "/" + str(len(lists)))
            slug = self.get_list_slug(item)
            print(slug)
            users = self.list_members(base_user, slug)
            count += 1
            for user in users:
                print(user)
                # set membership keeps the result duplicate-free while
                # preserving first-seen order
                if user not in seen:
                    seen.add(user)
                    valuable_users.append(user)
        return valuable_users

    # Gather the lists that `user` owns (lists_all also returns
    # subscriptions, hence the screen_name filter).
    # Rate: 15.
    # Returns tweepy List objects.
    def users_lists(self, user):
        return [tweet_list for tweet_list in self.api.lists_all(user)
                if tweet_list.user.screen_name == user]

    # Returns the status object for a tweet id
    def get_status(self, tweet):
        return self.api.get_status(tweet)

    # helper method to return a list's slug
    @staticmethod
    def get_list_slug(list_item):
        return list_item.slug

    # Helper to transform raw tweepy statuses into a TweetList of Tweet objects.
    @staticmethod
    def parse_tweets(data):
        tweet_list = TweetList()
        for tweet in data:
            tweet_obj = Tweet()
            tweet_obj.add_tweet_json(tweet)
            tweet_list.insert_data(tweet_obj)
        return tweet_list


# ============================================================
# /src/UIWidget/widget.py
# ============================================================
import tkinter


# UI Class: a tkinter window for manually scoring tweets
class UserInterface:
    # data: TweetList to score; db/db_name: database handle and table name
    # used to persist each given score.
    def __init__(self, data, db, db_name):
        # iteration state: count is the 1-based index of the tweet on screen
        self.count = 1
        self.data = data
        self.db_name = db_name
        self.db = db

        # root frame: background colour, grid weights, and initial size
        self.root = tkinter.Tk()
        self.root.configure(bg="#ebeef3")
        self.root.columnconfigure(0, weight=1)
        self.root.rowconfigure(0, weight=1)
        self.root.rowconfigure(1, weight=3)
        self.root.geometry('700x200')
        self.root.title("Twitter Sentiment Analysis Tool (TSAT) Training Set Builder")

        # upper frame holds the tweet text, lower frame the buttons
        self.upper_frame = tkinter.Frame(self.root)
        self.upper_frame.grid(column=0, row=0)

        self.lower_frame = tkinter.Frame(self.root)
        self.lower_frame.grid(column=0, row=1)
        self.lower_frame.rowconfigure(0, weight=1)

        # variables for updating text within labels in real time
        self.text_var = tkinter.StringVar()
        self.count_text = tkinter.StringVar()

    # Builds the widget and enters the tkinter main loop.
    def classify(self):
        self.widget()
        self.root.mainloop()

    # Steps back one tweet so a score can be corrected.
    def go_bck_callback(self):
        self.count -= 1
        self.update_text()

    # Helper shared by the four scoring buttons: record `score` on the
    # tweet currently shown, persist it to the db, then advance.
    def _score_current(self, score):
        tweet = self.data.get_tweet(self.count)
        tweet.add_given_score(score)
        self.db.update_column_by_text(self.db_name, 'given_score', tweet.text,
                                      tweet.given_score)
        self.count += 1
        self.update_text()

    # scores the current tweet as positive
    def pos_callback(self):
        self._score_current(1)

    # scores the current tweet as negative
    def neg_callback(self):
        self._score_current(-1)

    # scores the current tweet as neutral
    def neutral_callback(self):
        self._score_current(0)

    # scores the current tweet as irrelevant
    def irr_callback(self):
        self._score_current(100)

    # Lays out the labels and scoring buttons.
    def widget(self):
        # set initial variables
        # noinspection PyBroadException
        try:
            self.text_var.set(self.data.get_tweet(self.count).text)
        except Exception:
            # tkinter cannot render some characters (the original handled
            # this the same broad way)
            self.text_var.set("CHARACTER OUT OF RANGE")
            print("Character is out of range")
        self.count_text.set("Count: " + str(self.count) + "/" + str(self.data.get_size()))

        # common styling for the two labels
        label_opts = dict(relief="groove", fg='#3e4247', borderwidth=2,
                          highlightcolor='#326690', font='Helvetica', bg='#d6effc')

        # tweet text with the running counter beside it
        tweet_text = tkinter.Label(self.upper_frame, wraplength='500',
                                   textvariable=self.text_var, **label_opts)
        tweet_text.grid(column=1, row=0)

        counter = tkinter.Label(self.upper_frame, wraplength='70',
                                textvariable=self.count_text, **label_opts)
        counter.grid(column=2, row=0)

        # one button per action on the lower row; columns match the
        # original layout (Go Back, Negative, Neutral, Positive, Irrelevant)
        button_opts = dict(bg='#5b6d7c', font='Helvetica', fg='white')
        for text, command, column in [("Go Back", self.go_bck_callback, 0),
                                      ("Negative", self.neg_callback, 1),
                                      ("Neutral", self.neutral_callback, 2),
                                      ("Positive", self.pos_callback, 3),
                                      ("Irrelevant", self.irr_callback, 4)]:
            tkinter.Button(self.lower_frame, text=text, command=command,
                           **button_opts).grid(column=column, row=0)

    # Refreshes the labels after moving; handles running off the end of
    # the data set and unrenderable characters.
    def update_text(self):
        # terminate if count = size+1
        if self.count == self.data.get_size() + 1:
            # fixed user-facing typo: original said "Comlpete"
            self.text_var.set("Data Training is Complete. To edit, use the go back button. Otherwise close the window")
        # check to make sure user does not go too far back
        else:
            # noinspection PyBroadException
            try:
                self.text_var.set(self.data.get_tweet(self.count).text)
                self.count_text.set("Count: " + str(self.count) + "/" + str(self.data.get_size()))
            except IndexError:
                print("Tweet index not available")
            except Exception:
                self.text_var.set("CHARACTER OUT OF RANGE")
                print("Character is out of range")


# ============================================================
# /src/Visualization/plot.py
# ============================================================
import matplotlib.pyplot as plt


# Plot Class: visualizes classification and projection data
class Plot:
    # initialize all score tallies to 0
    def __init__(self, db):
        self.db = db
        self.pos_count = 0
        self.neg_count = 0
        self.irr_count = 0
        self.neutral_count = 0

    # Tallies the given scores stored in `column_name` of `db_name`.
    # Score encoding (matches the UI): 1 positive, -1 negative,
    # 0 neutral, 100 irrelevant.
    def generate_projections(self, db_name, column_name):
        data = self.db.get_column_data(db_name, column_name)
        print(data)
        for value in data:
            # the cases are mutually exclusive, so elif is sufficient
            score = value[0]
            if score == 1:
                self.pos_count += 1
            elif score == 100:
                self.irr_count += 1
            elif score == -1:
                self.neg_count += 1
            elif score == 0:
                self.neutral_count += 1

    # Shows a bar chart of the tallied given scores
    # (call generate_projections first).
    def build_projections_histogram(self):
        x_values = ['Negative', 'Neutral', 'Positive', 'Irrelevant']
        y_values = [self.neg_count, self.neutral_count, self.pos_count, self.irr_count]

        fig, ax = plt.subplots()
        ax.bar(x_values, y_values, color='orange')
        plt.title("Given Scores for Training Set")
        plt.ylabel("Count")
        plt.xlabel("Given_Score")

        plt.show()

    # Builds a bar chart of the classifier's output counts.
    # NOTE(review): unlike build_projections_histogram this never calls
    # plt.show() -- confirm whether the caller shows the figure itself.
    @staticmethod
    def create_classification_plot(pos_score, neg_score, neutral_score, irr_score):
        x_values = ['Negative', 'Neutral', 'Positive', 'Irrelevant']
        y_values = [neg_score, neutral_score, pos_score, irr_score]

        fig, ax = plt.subplots()
        ax.bar(x_values, y_values, color='orange')
        plt.title("Given Scores for Classifying Test Set")
        plt.ylabel("Count")
        plt.xlabel("Given_Score")