├── fbmessages
│   ├── __init__.py
│   ├── __main__.py
│   └── analyzer.py
├── .gitignore
├── requirements.txt
├── LICENSE
└── README.md

/fbmessages/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
messages
results
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
unidecode
matplotlib
nltk
vaderSentiment
--------------------------------------------------------------------------------
/fbmessages/__main__.py:
--------------------------------------------------------------------------------
import argparse
import analyzer

parser = argparse.ArgumentParser(description='Tool to analyze your Facebook Messenger history')
parser.add_argument('file', help='Facebook chat messages in JSON format')

args = parser.parse_args()
analyzer.analyze(args.file)
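# Usage note: the README runs this package as `python fbmessages ${FILE}`.
# That invocation puts the fbmessages/ directory itself on sys.path, which is
# why the plain `import analyzer` above resolves. An example run (the path
# below is hypothetical; substitute the message file from your own export):
#
#   python fbmessages messages/inbox/janedoe_abc123/message_1.json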
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright © 2020 David Hacker

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# facebook-message-analysis

1. Download your Facebook Messenger history from your Facebook settings.
   [More here.](https://webapps.stackexchange.com/questions/27640/how-can-i-download-all-messages-from-facebook)
2. Unzip your data into the directory of your choice.
3. Identify a person whose chat history you want to analyze.
4. Find the JSON file listing all of their messages with you (named after their username).
   1. We will refer to this file's path as **${FILE}**. A sketch of this file's expected layout appears at the end of this README.
5. Clone this repository and change directory into it.
```
git clone https://github.com/dmhacker/facebook-message-analysis && cd facebook-message-analysis
```
6. Install any dependencies.
```
pip install -r requirements.txt
```
7. If you get an NLTK download error, run the commands below to resolve the issue.
   They tell NLTK to download the appropriate stopwords file.
```
python
>>> import nltk
>>> nltk.download('stopwords')
>>> quit()
```
8. Run the analyzer.
```
python fbmessages ${FILE}
```

In a few seconds, you should get some nice visualizations. Have fun!
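For reference, `analyzer.py` expects **${FILE}** to hold a top-level `messages` list. Every entry needs a `timestamp_ms` field (Unix epoch milliseconds); text messages also carry a `content` field, and sticker messages a `sticker` field. Here is a minimal sketch of that layout (the names and values below are made up; only the keys just listed are actually read by the analyzer):
```
{
  "messages": [
    {
      "sender_name": "Jane Doe",
      "timestamp_ms": 1577836800000,
      "content": "Happy new year!"
    },
    {
      "sender_name": "John Doe",
      "timestamp_ms": 1577836860000,
      "sticker": { "uri": "messages/stickers_used/sticker.png" }
    }
  ]
}
```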
--------------------------------------------------------------------------------
/fbmessages/analyzer.py:
--------------------------------------------------------------------------------
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import defaultdict
from operator import itemgetter
from nltk.corpus import stopwords
from unidecode import unidecode

import numpy as np
import matplotlib.pyplot as plt
import datetime
import heapq
import string
import time
import json
import copy


english_stopwords = set(stopwords.words('english'))
sentiment_analyzer = SentimentIntensityAnalyzer()
cache = {}


def _load_messages(filename):
    # Parse the JSON file only once; afterwards, serve the raw dict from the cache
    if filename in cache:
        return cache[filename]
    with open(filename) as jsonfile:
        data = json.load(jsonfile)
        cache[filename] = data
        return data


def get_messages(filename, copy_from_cache=True):
    data = _load_messages(filename)

    # Copy the stored messages so that callers cannot mutate the cache
    copied_messages = data['messages']
    if copy_from_cache:
        copied_messages = copy.deepcopy(data['messages'])

    # Return a list of the messages sorted by time
    return sorted(copied_messages, key=lambda message: message['timestamp_ms'])
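# A quick illustration of the caching behavior above (the path is hypothetical;
# any message file works):
#
#   a = get_messages('messages/inbox/janedoe_abc123/message_1.json')  # parses and caches
#   b = get_messages('messages/inbox/janedoe_abc123/message_1.json')  # deep-copied from cache
#   assert a == b and a is not b  # equal contents, independent objects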

def analyze(filename):
    # Load messages
    print('Reading file {0} ...'.format(filename))
    timestamp = time.perf_counter()
    messages = get_messages(filename, copy_from_cache=False)
    print('Loaded {0} messages in {1:.2f} seconds.'.format(len(messages), time.perf_counter() - timestamp))

    print('Aggregating data ...')
    timestamp = time.perf_counter()

    # Data structures to hold information about the messages
    daily_counts = defaultdict(int)
    daily_sticker_counts = defaultdict(int)
    daily_sentiments = defaultdict(float)
    monthly_counts = defaultdict(int)
    monthly_sticker_counts = defaultdict(int)
    hourly_counts = defaultdict(int)
    day_name_counts = defaultdict(int)
    word_frequencies = defaultdict(int)
    first_date = None
    last_date = None

    # Extract information from the messages
    for message in messages:
        # Convert the message's Unix timestamp to a local datetime
        date = datetime.datetime.fromtimestamp(message['timestamp_ms'] / 1000.0)
        month = date.strftime('%Y-%m')
        day = date.strftime('%Y-%m-%d')
        day_name = date.strftime('%A')
        hour = date.time().hour

        # Get the content in the message if it has any; default to an empty
        # string so that messages without content (e.g. bare stickers) neither
        # raise a NameError nor reuse the previous message's text
        content = ''
        if 'content' in message:
            content = unidecode(message['content'])

        # Increment message counts
        hourly_counts[hour] += 1
        day_name_counts[day_name] += 1
        daily_counts[day] += 1
        monthly_counts[month] += 1
        if 'sticker' in message:
            daily_sticker_counts[day] += 1
            monthly_sticker_counts[month] += 1

        # Rudimentary sentiment analysis using VADER
        sentiments = sentiment_analyzer.polarity_scores(content)
        daily_sentiments[day] += sentiments['pos'] - sentiments['neg']

        # Determine word frequencies
        if content:
            # Split the message on spaces to get individual words
            for word in content.split(' '):
                # Make the word lowercase and strip it of punctuation
                new_word = word.lower().strip(string.punctuation)

                # The word might have been entirely punctuation; in that case,
                # fall back to the unstripped lowercase word
                if not new_word:
                    new_word = word.lower()

                # Ignore the word if it is in the stopword set or shorter than 2 characters
                if len(new_word) > 1 and new_word not in english_stopwords:
                    word_frequencies[new_word] += 1

        # Track the first and last dates across all messages
        if first_date is None or date < first_date:
            first_date = date
        if last_date is None or date > last_date:
            last_date = date

    # Take the average of the sentiment amassed for each day
    for day, message_count in daily_counts.items():
        daily_sentiments[day] /= message_count

    # Get the number of days the messages span; use at least 1 so the
    # per-day averages below never divide by zero
    num_days = max((last_date - first_date).days, 1)

    # Get the most common words
    top_words = heapq.nlargest(42, word_frequencies.items(), key=itemgetter(1))

    print('Processed data in {0:.2f} seconds.'.format(time.perf_counter() - timestamp))

    print('Preparing data for display ...')

    # Format data for graphing
    xdata_daily = sorted(daily_counts.keys())
    ydata_daily = [daily_counts[x] for x in xdata_daily]
    ydata_daily_stickers = [daily_sticker_counts[x] for x in xdata_daily]
    xdata_monthly = sorted(monthly_counts.keys())
    ydata_monthly = [monthly_counts[x] for x in xdata_monthly]
    ydata_monthly_stickers = [monthly_sticker_counts[x] for x in xdata_monthly]
    xdata_day_name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    # Normalize each weekday's total into an average count for that day of the week
    ydata_day_name = [float(day_name_counts[x]) / num_days * 7 for x in xdata_day_name]
    xdata_hourly = ['{0}:00'.format(i) for i in range(24)]
    ydata_hourly = [float(hourly_counts[x]) / num_days for x in range(24)]
    xdata_sentiment = sorted(daily_sentiments.keys())
    ydata_sentiment = [daily_sentiments[x] for x in xdata_sentiment]
    xdata_top_words, ydata_top_words = zip(*top_words)

    print('Displaying ...')

    # Generate subplots
    fig, ax_array = plt.subplots(2, 3)

    def show_daily_total_graph(ax, xdata, ydata, ydata_stickers):
        indices = np.arange(len(xdata))

        ax.plot(indices, ydata,
                alpha=1.0, color='dodgerblue',
                label='All messages')

        ax.plot(indices, ydata_stickers,
                alpha=1.0, color='orange',
                label='Facebook stickers')

        ax.set_xlabel('Date')
        ax.set_ylabel('Count')
        ax.set_title('Number of messages exchanged every day')

        # Show at most 16 evenly spaced date ticks to keep the axis readable
        num_ticks = min(16, len(indices))
        tick_spacing = round(len(indices) / num_ticks)
        ticks = [tick_spacing * i for i in range(num_ticks) if tick_spacing * i < len(xdata)]
        tick_labels = [xdata[tick] for tick in ticks]

        ax.set_xticks(ticks)
        ax.set_xticklabels(tick_labels)
        for tick in ax.get_xticklabels():
            tick.set_rotation(30)

        ax.legend()

    def show_monthly_total_graph(ax, xdata, ydata, ydata_stickers):
        indices = np.arange(len(xdata))

        ax.bar(indices, ydata,
               alpha=1.0, color='dodgerblue',
               label='All messages')

        # Drawn on top of the totals, so each bar also shows its sticker share
        ax.bar(indices, ydata_stickers,
               alpha=1.0, color='orange',
               label='Facebook stickers')

        ax.set_xlabel('Date')
        ax.set_ylabel('Count')
        ax.set_title('Number of messages exchanged every month')

        ax.set_xticks(indices)
        ax.set_xticklabels(xdata)
        for tick in ax.get_xticklabels():
            tick.set_rotation(30)

        ax.legend()

    def show_day_name_average_graph(ax, xdata, ydata):
        indices = np.arange(len(xdata))
        bar_width = 0.6

        ax.bar(indices, ydata, bar_width,
               alpha=1.0, color='dodgerblue',
               align='center',
               label='All messages')

        ax.set_xlabel('Day of the Week')
        ax.set_ylabel('Count')
        ax.set_title('Average number of messages every day of the week')

        ax.set_xticks(indices)
        ax.set_xticklabels(xdata)

    def show_hourly_average_graph(ax, xdata, ydata):
        indices = np.arange(len(xdata))
        bar_width = 0.8

        ax.bar(indices, ydata, bar_width,
               alpha=1.0, color='dodgerblue',
               align='center',
               label='All messages')

        ax.set_xlabel('Hour')
        ax.set_ylabel('Count')
        ax.set_title('Average number of messages every hour of the day')

        ax.set_xticks(indices)
        ax.set_xticklabels(xdata)
        for tick in ax.get_xticklabels():
            tick.set_rotation(30)

    def show_daily_sentiment_graph(ax, xdata, ydata):
        indices = np.arange(len(xdata))

        ax.plot(indices, ydata,
                alpha=1.0, color='darkseagreen',
                label='VADER sentiment')

        ax.set_xlabel('Date')
        ax.set_ylabel('Sentiment')
        ax.set_title('Average sentiment over time')

        # Show at most 16 evenly spaced date ticks to keep the axis readable
        num_ticks = min(16, len(indices))
        tick_spacing = round(len(indices) / num_ticks)
        ticks = [tick_spacing * i for i in range(num_ticks) if tick_spacing * i < len(xdata)]
        tick_labels = [xdata[tick] for tick in ticks]

        ax.set_xticks(ticks)
        ax.set_xticklabels(tick_labels)
        for tick in ax.get_xticklabels():
            tick.set_rotation(30)
        ax.set_ylim([-1.0, 1.0])

        ax.legend()

    def show_top_words_graph(ax, xdata, ydata):
        indices = np.arange(len(xdata))
        bar_width = 0.8

        ax.barh(indices, ydata, bar_width,
                alpha=1.0, color='orchid',
                align='center',
                label='All messages')

        ax.set_ylabel('Word')
        ax.set_xlabel('Uses')
        ax.set_title('Our {0} most used words'.format(len(xdata)))

        ax.set_yticks(indices)
        ax.set_yticklabels(xdata)

    # Call the graphing methods
    show_daily_total_graph(ax_array[0][0], xdata_daily, ydata_daily, ydata_daily_stickers)
    show_monthly_total_graph(ax_array[0][1], xdata_monthly, ydata_monthly, ydata_monthly_stickers)
    show_daily_sentiment_graph(ax_array[0][2], xdata_sentiment, ydata_sentiment)
    show_day_name_average_graph(ax_array[1][0], xdata_day_name, ydata_day_name)
    show_hourly_average_graph(ax_array[1][1], xdata_hourly, ydata_hourly)
    show_top_words_graph(ax_array[1][2], xdata_top_words[::-1], ydata_top_words[::-1])

    # Display the plots
    plt.show()

    print('Done.')
--------------------------------------------------------------------------------