├── fbmessages
│   ├── __init__.py
│   ├── __main__.py
│   └── analyzer.py
├── .gitignore
├── requirements.txt
├── LICENSE
└── README.md

/fbmessages/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
messages
results
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
unidecode
matplotlib
nltk
vaderSentiment
--------------------------------------------------------------------------------
/fbmessages/__main__.py:
--------------------------------------------------------------------------------
import argparse
import analyzer

parser = argparse.ArgumentParser(description='Tool to analyze your Facebook Messenger history')
parser.add_argument('file', help='Facebook chat messages in JSON format')

args = parser.parse_args()
analyzer.analyze(args.file)
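# Usage note: the README runs this package as `python fbmessages ${FILE}`.
# That invocation puts the fbmessages/ directory itself on sys.path, which is
# why the plain `import analyzer` above resolves. An example run (the path
# below is hypothetical; substitute the message file from your own export):
#
#   python fbmessages messages/inbox/janedoe_abc123/message_1.json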
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright © 2020 David Hacker

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# facebook-message-analysis

1. Download your Facebook Messenger history from your Facebook settings.
   [More here.](https://webapps.stackexchange.com/questions/27640/how-can-i-download-all-messages-from-facebook)
2. Unzip your data into the directory of your choice.
3. Identify a person whose chat history you want to analyze.
4. Find the JSON file listing all of their messages with you (named after their username).
   1. We will refer to this file's path as **${FILE}**. A sketch of this file's expected layout appears at the end of this README.
5. Clone this repository and change directory into it.
```
git clone https://github.com/dmhacker/facebook-message-analysis && cd facebook-message-analysis
```
6. Install any dependencies.
```
pip install -r requirements.txt
```
7. If you get an NLTK download error, run the commands below to resolve the issue.
   They tell NLTK to download the appropriate stopwords file.
```
python
>>> import nltk
>>> nltk.download('stopwords')
>>> quit()
```
8. Run the analyzer.
```
python fbmessages ${FILE}
```

In a few seconds, you should get some nice visualizations. Have fun!
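For reference, `analyzer.py` expects **${FILE}** to hold a top-level `messages` list. Every entry needs a `timestamp_ms` field (Unix epoch milliseconds); text messages also carry a `content` field, and sticker messages a `sticker` field. Here is a minimal sketch of that layout (the names and values below are made up; only the keys just listed are actually read by the analyzer):
```
{
  "messages": [
    {
      "sender_name": "Jane Doe",
      "timestamp_ms": 1577836800000,
      "content": "Happy new year!"
    },
    {
      "sender_name": "John Doe",
      "timestamp_ms": 1577836860000,
      "sticker": { "uri": "messages/stickers_used/sticker.png" }
    }
  ]
}
```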
--------------------------------------------------------------------------------
/fbmessages/analyzer.py:
--------------------------------------------------------------------------------
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import defaultdict
from operator import itemgetter
from nltk.corpus import stopwords
from unidecode import unidecode

import numpy as np
import matplotlib.pyplot as plt
import datetime
import heapq
import string
import time
import json
import copy


english_stopwords = set(stopwords.words('english'))
sentiment_analyzer = SentimentIntensityAnalyzer()
cache = {}


def _load_messages(filename):
    # Parse the JSON file only once; afterwards, serve the raw dict from the cache
    if filename in cache:
        return cache[filename]
    with open(filename) as jsonfile:
        data = json.load(jsonfile)
        cache[filename] = data
        return data


def get_messages(filename, copy_from_cache=True):
    data = _load_messages(filename)

    # Copy the stored messages so that callers cannot mutate the cache
    copied_messages = data['messages']
    if copy_from_cache:
        copied_messages = copy.deepcopy(data['messages'])

    # Return a list of the messages sorted by time
    return sorted(copied_messages, key=lambda message: message['timestamp_ms'])
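# A quick illustration of the caching behavior above (the path is hypothetical;
# any message file works):
#
#   a = get_messages('messages/inbox/janedoe_abc123/message_1.json')  # parses and caches
#   b = get_messages('messages/inbox/janedoe_abc123/message_1.json')  # deep-copied from cache
#   assert a == b and a is not b  # equal contents, independent objects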

def analyze(filename):
    # Load messages
    print('Reading file {0} ...'.format(filename))
    timestamp = time.perf_counter()
    messages = get_messages(filename, copy_from_cache=False)
    print('Loaded {0} messages in {1:.2f} seconds.'.format(len(messages), time.perf_counter() - timestamp))

    print('Aggregating data ...')
    timestamp = time.perf_counter()

    # Data structures to hold information about the messages
    daily_counts = defaultdict(int)
    daily_sticker_counts = defaultdict(int)
    daily_sentiments = defaultdict(float)
    monthly_counts = defaultdict(int)
    monthly_sticker_counts = defaultdict(int)
    hourly_counts = defaultdict(int)
    day_name_counts = defaultdict(int)
    word_frequencies = defaultdict(int)
    first_date = None
    last_date = None

    # Extract information from the messages
    for message in messages:
        # Convert the message's Unix timestamp to a local datetime
        date = datetime.datetime.fromtimestamp(message['timestamp_ms'] / 1000.0)
        month = date.strftime('%Y-%m')
        day = date.strftime('%Y-%m-%d')
        day_name = date.strftime('%A')
        hour = date.time().hour

        # Get the content in the message if it has any; default to an empty
        # string so that messages without content (e.g. bare stickers) neither
        # raise a NameError nor reuse the previous message's text
        content = ''
        if 'content' in message:
            content = unidecode(message['content'])

        # Increment message counts
        hourly_counts[hour] += 1
        day_name_counts[day_name] += 1
        daily_counts[day] += 1
        monthly_counts[month] += 1
        if 'sticker' in message:
            daily_sticker_counts[day] += 1
            monthly_sticker_counts[month] += 1

        # Rudimentary sentiment analysis using VADER
        sentiments = sentiment_analyzer.polarity_scores(content)
        daily_sentiments[day] += sentiments['pos'] - sentiments['neg']

        # Determine word frequencies
        if content:
            # Split the message on spaces to get individual words
            for word in content.split(' '):
                # Make the word lowercase and strip it of punctuation
                new_word = word.lower().strip(string.punctuation)

                # The word might have been entirely punctuation; in that case,
                # fall back to the unstripped lowercase word
                if not new_word:
                    new_word = word.lower()

                # Ignore the word if it is in the stopword set or shorter than 2 characters
                if len(new_word) > 1 and new_word not in english_stopwords:
                    word_frequencies[new_word] += 1

        # Track the first and last dates across all messages
        if first_date is None or date < first_date:
            first_date = date
        if last_date is None or date > last_date:
            last_date = date

    # Take the average of the sentiment amassed for each day
    for day, message_count in daily_counts.items():
        daily_sentiments[day] /= message_count

    # Get the number of days the messages span; use at least 1 so the
    # per-day averages below never divide by zero
    num_days = max((last_date - first_date).days, 1)

    # Get the most common words
    top_words = heapq.nlargest(42, word_frequencies.items(), key=itemgetter(1))

    print('Processed data in {0:.2f} seconds.'.format(time.perf_counter() - timestamp))

    print('Preparing data for display ...')

    # Format data for graphing
    xdata_daily = sorted(daily_counts.keys())
    ydata_daily = [daily_counts[x] for x in xdata_daily]
    ydata_daily_stickers = [daily_sticker_counts[x] for x in xdata_daily]
    xdata_monthly = sorted(monthly_counts.keys())
    ydata_monthly = [monthly_counts[x] for x in xdata_monthly]
    ydata_monthly_stickers = [monthly_sticker_counts[x] for x in xdata_monthly]
    xdata_day_name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    # Normalize each weekday's total into an average count for that day of the week
    ydata_day_name = [float(day_name_counts[x]) / num_days * 7 for x in xdata_day_name]
    xdata_hourly = ['{0}:00'.format(i) for i in range(24)]
    ydata_hourly = [float(hourly_counts[x]) / num_days for x in range(24)]
    xdata_sentiment = sorted(daily_sentiments.keys())
    ydata_sentiment = [daily_sentiments[x] for x in xdata_sentiment]
    xdata_top_words, ydata_top_words = zip(*top_words)

    print('Displaying ...')

    # Generate subplots
    fig, ax_array = plt.subplots(2, 3)

    def show_daily_total_graph(ax, xdata, ydata, ydata_stickers):
        indices = np.arange(len(xdata))

        ax.plot(indices, ydata,
                alpha=1.0, color='dodgerblue',
                label='All messages')

        ax.plot(indices, ydata_stickers,
                alpha=1.0, color='orange',
                label='Facebook stickers')

        ax.set_xlabel('Date')
        ax.set_ylabel('Count')
        ax.set_title('Number of messages exchanged every day')

        # Show at most 16 evenly spaced date ticks to keep the axis readable
        num_ticks = min(16, len(indices))
        tick_spacing = round(len(indices) / num_ticks)
        ticks = [tick_spacing * i for i in range(num_ticks) if tick_spacing * i < len(xdata)]
        tick_labels = [xdata[tick] for tick in ticks]

        ax.set_xticks(ticks)
        ax.set_xticklabels(tick_labels)
        for tick in ax.get_xticklabels():
            tick.set_rotation(30)

        ax.legend()

    def show_monthly_total_graph(ax, xdata, ydata, ydata_stickers):
        indices = np.arange(len(xdata))

        ax.bar(indices, ydata,
               alpha=1.0, color='dodgerblue',
               label='All messages')

        # Drawn on top of the totals, so each bar also shows its sticker share
        ax.bar(indices, ydata_stickers,
               alpha=1.0, color='orange',
               label='Facebook stickers')

        ax.set_xlabel('Date')
        ax.set_ylabel('Count')
        ax.set_title('Number of messages exchanged every month')

        ax.set_xticks(indices)
        ax.set_xticklabels(xdata)
        for tick in ax.get_xticklabels():
            tick.set_rotation(30)

        ax.legend()

    def show_day_name_average_graph(ax, xdata, ydata):
        indices = np.arange(len(xdata))
        bar_width = 0.6

        ax.bar(indices, ydata, bar_width,
               alpha=1.0, color='dodgerblue',
               align='center',
               label='All messages')

        ax.set_xlabel('Day of the Week')
        ax.set_ylabel('Count')
        ax.set_title('Average number of messages every day of the week')

        ax.set_xticks(indices)
        ax.set_xticklabels(xdata)

    def show_hourly_average_graph(ax, xdata, ydata):
        indices = np.arange(len(xdata))
        bar_width = 0.8

        ax.bar(indices, ydata, bar_width,
               alpha=1.0, color='dodgerblue',
               align='center',
               label='All messages')

        ax.set_xlabel('Hour')
        ax.set_ylabel('Count')
        ax.set_title('Average number of messages every hour of the day')

        ax.set_xticks(indices)
        ax.set_xticklabels(xdata)
        for tick in ax.get_xticklabels():
            tick.set_rotation(30)

    def show_daily_sentiment_graph(ax, xdata, ydata):
        indices = np.arange(len(xdata))

        ax.plot(indices, ydata,
                alpha=1.0, color='darkseagreen',
                label='VADER sentiment')

        ax.set_xlabel('Date')
        ax.set_ylabel('Sentiment')
        ax.set_title('Average sentiment over time')

        # Show at most 16 evenly spaced date ticks to keep the axis readable
        num_ticks = min(16, len(indices))
        tick_spacing = round(len(indices) / num_ticks)
        ticks = [tick_spacing * i for i in range(num_ticks) if tick_spacing * i < len(xdata)]
        tick_labels = [xdata[tick] for tick in ticks]

        ax.set_xticks(ticks)
        ax.set_xticklabels(tick_labels)
        for tick in ax.get_xticklabels():
            tick.set_rotation(30)
        ax.set_ylim([-1.0, 1.0])

        ax.legend()

    def show_top_words_graph(ax, xdata, ydata):
        indices = np.arange(len(xdata))
        bar_width = 0.8

        ax.barh(indices, ydata, bar_width,
                alpha=1.0, color='orchid',
                align='center',
                label='All messages')

        ax.set_ylabel('Word')
        ax.set_xlabel('Uses')
        ax.set_title('Our {0} most used words'.format(len(xdata)))

        ax.set_yticks(indices)
        ax.set_yticklabels(xdata)

    # Call the graphing methods
    show_daily_total_graph(ax_array[0][0], xdata_daily, ydata_daily, ydata_daily_stickers)
    show_monthly_total_graph(ax_array[0][1], xdata_monthly, ydata_monthly, ydata_monthly_stickers)
    show_daily_sentiment_graph(ax_array[0][2], xdata_sentiment, ydata_sentiment)
    show_day_name_average_graph(ax_array[1][0], xdata_day_name, ydata_day_name)
    show_hourly_average_graph(ax_array[1][1], xdata_hourly, ydata_hourly)
    show_top_words_graph(ax_array[1][2], xdata_top_words[::-1], ydata_top_words[::-1])

    # Display the plots
    plt.show()

    print('Done.')
--------------------------------------------------------------------------------