├── __init__.py ├── .gitignore ├── name_hash.py ├── helpers.py ├── README.md ├── setup.py ├── experimental.py ├── group_message_analysis.py ├── name_dump.txt └── private_message_analysis.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | config.py 2 | data/ 3 | data*/ 4 | __pycache__ 5 | friends.py 6 | most_messaged_per_month.txt 7 | message_dump.txt 8 | temp.py 9 | .vscode 10 | *.out 11 | *.json -------------------------------------------------------------------------------- /name_hash.py: -------------------------------------------------------------------------------- 1 | class NameHasher(): 2 | def __init__(self): 3 | with open("name_dump.txt", "r") as f: 4 | self.anon_names = [name.strip() for name in f.readlines()] 5 | 6 | def hash_by_index(self, index): 7 | """Given an index, return a name""" 8 | bounded_index = index % len(self.anon_names) 9 | return self.anon_names[bounded_index] 10 | 11 | def hash_by_name(self, name_in): 12 | """Hash an input name to an anonymous name""" 13 | index = hash(name_in) 14 | return self.hash_by_index(index) -------------------------------------------------------------------------------- /helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import datetime 3 | from collections import defaultdict 4 | 5 | 6 | def get_json(path): 7 | with open(path, "r") as f: 8 | return json.loads(f.read()) 9 | 10 | 11 | def check_participants(message_json): 12 | """Check that a thread is a 1 on 1 conversation (exactly two participants)""" 13 | return len(message_json.get("participants", [])) == 2 14 | # return len(message_json.get("participants", [])) == 1 15 | 16 | 17 | def bucket_datetime(timestamp, period="Month"): 18 | """ 19 | We aggregate data such as message counts by casting each timestamp to a datetime bucket 20 | 21 | For example, if we want data by month, an event that happens on August 5th, 1998 22 | will be cast to the August-1998 bucket, which allows us to treat all events 23 | in August 1998 the same 24 | """ 25 | if period == "Day": 26 | return datetime.datetime(year=timestamp.year, month=timestamp.month, day=timestamp.day) 27 | elif period == "Month": 28 | return datetime.datetime(year=timestamp.year, month=timestamp.month, day=1) 29 | elif period == "Year": 30 | return datetime.datetime(year=timestamp.year, month=1, day=1) 31 | raise Exception("Unsupported period: %s" % period) 32 | 33 | 34 | def count_messages(messages): 35 | counters = defaultdict(int) 36 | participants = set() 37 | for message in messages: 38 | sender = message.get("sender_name", "") 39 | participants.add(sender) 40 | counters[sender] += 1 41 | return sum(counters.values()) if len(participants) == 2 else 0 42 | 43 | 44 | def time_format(period): 45 | """strftime formatting""" 46 | if period == "Day": 47 | return "%m-%d-%Y" 48 | elif period == "Month": 49 | return "%m-%Y" 50 | elif period == "Year": 51 | return "%Y" 52 | 53 | 
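# Usage sketch for the two helpers above: bucket_datetime(datetime.datetime(1998, 8, 5, 14, 30), period="Month")
# returns datetime.datetime(1998, 8, 1, 0, 0), and formatting that bucket with time_format("Month") ("%m-%Y")
# yields "08-1998".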
54 | def message_dump(messages, period="Month"): 55 | """ 56 | Dump messages from a specific time period to message_dump.txt 57 | """ 58 | for message in reversed(messages): 59 | participant = message["sender_name"] 60 | 61 | # Grab timestamp from message and cast it to a month + year timestamp 62 | timestamp = datetime.datetime.fromtimestamp( 63 | message["timestamp_ms"]/1000) 64 | m_time = bucket_datetime(timestamp, period=period) 65 | 66 | # We use this to get all messages from a certain month 67 | TARGET = datetime.datetime(year=2017, month=10, day=1) 68 | if TARGET == m_time: 69 | with open("message_dump.txt", 'a') as f: 70 | f.write(participant + ": " + message.get("content", "") + "\n") 71 | 72 | 73 | width_dict = { 74 | "Year": 200, 75 | "Month": 35, 76 | "Day": 8 77 | } 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Facebook Message Analysis 2 | 3 | Analyze Facebook messages for interesting statistics 4 | 5 | ## Prerequisites 6 | 7 | ``` 8 | python3 and these libraries: 9 | pip install matplotlib 10 | pip install numpy 11 | pip install tabulate 12 | ``` 13 | 14 | ## Getting Started 15 | Clone the repo 16 | ``` 17 | git clone https://github.com/Strafos/fb_messenger_analysis 18 | ``` 19 | Follow instructions [here](https://www.facebook.com/help/1701730696756992?helpref=hc_global_nav) to download your Facebook message data. 20 | 21 | Make sure to select "JSON" for the format. The media quality doesn't matter since we are only looking at text. 22 | 23 | After Facebook processes the request (it could take up to a couple of hours), unzip the data and put it in the repository. 24 | 25 | ## Setup 26 | 27 | `setup.py` will generate `friends.py`, which contains paths to the relevant message dumps. 28 | 29 | `setup.py` requires two arguments: 30 | 31 | `--dir`, the directory which has the unzipped Facebook message data, and 32 | 33 | `--name`, which should be your Facebook name in the format "John Smith" 34 | 35 | My setup looks like this 36 | 37 | ``` 38 | python setup.py --dir data --name "Zaibo Wang" 39 | ``` 40 | 41 | because my message data directory is inside the repository. 42 | 43 | `friends.py` will list out the top 50 (by default) most messaged friends in order. 44 | 45 | ## Examples 46 | 47 | ### Private messages 48 | 49 | `private_message_analysis.py` analyzes 1 on 1 messages. All methods are called from the main block and commented out by default. Generally, four statistics are supported: 50 | 51 | * Characters: total characters 52 | * Messages: total times enter is pressed 53 | * Clusters: all messages sent before being interrupted by the other participant 54 | * Words: count of elements split by spaces 55 | 56 | The supported time periods are Year, Month, and Day. 57 | 58 | All friends are initialized in `friends.py`. To access a friend in `private_message_analysis.py`, use the variable `friends.JOHN_SMITH` 59 | 60 | I used a name hash in the following example outputs so they don't show friends' real names 61 | 62 | --- 63 | `graph_stat` will create a bar graph of a given stat over a period. By default, it graphs Messages per Year between you and your best friend (most messaged friend). 64 | 65 | ![graph_messages](https://i.imgur.com/B6yaCSU.png) 66 | ![graph_characters](https://i.imgur.com/Qo9s2TY.png) 67 | --- 68 | `top_n_stat` shows the top n people for a given stat in each period. By default, it is set to the top 4 by characters per month (I think this statistic is the most interesting) 69 | 70 | ![n_top_stat](https://i.imgur.com/YHSfP6I.png) 71 | --- 72 | `count_links` gives an absolute count and ratio of links sent to/received from a person. By default, it only calculates this for the top 20 most messaged friends (I find that after 20, there are few links and the data is not useful) 73 | 74 | ![count_links](https://i.imgur.com/nzQvhG4.png) 75 | 76 | 
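Each of these examples is produced by uncommenting the corresponding call in the main block of `private_message_analysis.py`. For instance, a minimal main block might look like this (a sketch; `friends.JOHN_SMITH` is the placeholder variable described above):

```
if __name__ == "__main__":
    graph_stat(friends.JOHN_SMITH, stat="Characters", period="Month")
    top_n_stat(n=4, stat="Characters", period="Month", show_counts=True)
    plt.show(block=True)
```
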
--- 77 | `generate_averages` takes combinations of the aforementioned stats (such as Characters per Message) and calculates the average per person over all (top 50) friends. 78 | 79 | ![generate_averages](https://i.imgur.com/NJ2OPnt.png) 80 | 81 | --- 82 | 83 | `count_specific_words` takes a list of words and a friend and compares word frequencies. 84 | 85 | ![count_specific_words](https://i.imgur.com/NoZHPsQ.png) 86 | --- 87 | `total_stat_sent` shows how much of a certain stat you have sent over a period. The default is total Messages per Year. 88 | 89 | ![total_stat_send](https://i.imgur.com/vt9MYvF.png) 90 | 91 | ### Group Messages 92 | `group_message_analysis.py` has the code to analyze group messages. It is a little trickier to set up. The easiest way to run it is to pass the path of a group chat's message.json to the main method in `group_message_analysis.py`. 93 | 94 | Group chats can be hard to locate within the message dump, so another way is to use `find_groupchat()` in `setup.py`. This lets me specify a condition such as all group chats with more than 15 participants. I then add them to the GROUPCHATS list in `setup.py`, which writes the group chat paths into `friends.py`. These paths can then be passed into the main method as `friends.${chat_name}`. 95 | 96 | Result: 97 | ![group_chat](https://i.imgur.com/xzLZC60.png) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import argparse 4 | import glob 5 | from pprint import pprint 6 | 7 | from helpers import get_json, count_messages, check_participants 8 | 9 | my_name = None 10 | 11 | """ 12 | This file generates friends.py which is needed for all data analysis 13 | """ 14 | 15 | # To look at groupchats, use find_groupchat() in setup.py 16 | # by adding your conditions to narrow down the search 17 | # Then, add them to the GROUPCHATS list 18 | GROUPCHATS = [ 19 | # Format is a tuple (name, path): 20 | # ("situation_room", "/home/zaibo/code/fb_analysis/data/thesituationroom_69ae5d10b1/message.json"), 21 | # ("eggplant", "/home/zaibo/code/fb_analysis/data/96a68cd96d/message.json") 22 | ] 23 | 24 | 25 | def find_groupchat(): 26 | """ 27 | generate_friends() will not pick up group chats, so we must find them manually 28 | We can set up conditions to narrow down the chats (ex: find all groupchats with 15+ people) 29 | """ 30 | all_paths = [] 31 | for dir in os.listdir(base_dir): 32 | inner_dir = base_dir + "/" + dir 33 | for filename in os.listdir(inner_dir): 34 | if filename == "message.json": 35 | filepath = inner_dir + "/" + filename 36 | all_paths.append(filepath) 37 | 38 | for path in all_paths: 39 | message_json = get_json(path) 40 | party = message_json.get("participants", []) 41 | # Make some condition to look for group chats, this one is 15+ participants 42 | if len(party) > 15: 43 | print(path) 44 | 45 | 
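# For reference, each message.json is assumed to look roughly like this (only the fields
# these scripts read; the real export contains more fields):
# {
#     "participants": [{"name": "Zaibo Wang"}, {"name": "John Smith"}],
#     "messages": [
#         {"sender_name": "John Smith", "timestamp_ms": 1507082400000, "content": "hello"},
#         ...
#     ]
# }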
46 | def generate_friends(n=50): 47 | """ 48 | Generate friends.py which is used by most of the other scripts 49 | friends.py will contain paths to the top n most frequently messaged friends 50 | """ 51 | all_paths = [] 52 | for dir in os.listdir(base_dir): 53 | if dir.startswith("."): # Macs have a .DS_STORE file which throws an exception 54 | continue 55 | inner_dir = base_dir + "/" + dir 56 | for filename in os.listdir(inner_dir): 57 | if filename == "message.json": 58 | filepath = inner_dir + "/" + filename 59 | all_paths.append(filepath) 60 | 61 | # Each element is a tuple of (friend_name, total_messages, path) 62 | messages_per_friend = [] 63 | 64 | for path in all_paths: 65 | message_json = get_json(path) 66 | print(path) 67 | if check_participants(message_json): 68 | messages = message_json.get("messages", []) 69 | participant = message_json.get("participants") 70 | participant = [i for i in participant if i['name'] != my_name] 71 | if len(participant) != 1: 72 | continue 73 | participant = participant[0]['name'] 74 | total_messages = count_messages(messages) 75 | if total_messages != 0: 76 | messages_per_friend.append((participant, total_messages, path)) 77 | messages_per_friend.sort(key=lambda x: x[1], reverse=True) 78 | 79 | # People have weird names, this regex can break... 80 | name_pattern = "(?P<first_name>([A-Z]|-)*) (?P<last_name>([A-Z]|-)*)" 81 | with open("friends.py", "w") as f: 82 | # Create a "BEST_FRIEND" which will be the default path 83 | # BEST_FRIEND is the most messaged friend 84 | _, _, path = messages_per_friend[0] 85 | write_wrapper(f, "BEST_FRIEND", path) 86 | 87 | names_and_paths = [] 88 | paths = [] 89 | for name, _, path in messages_per_friend[:n]: 90 | name = name.upper() 91 | regex = re.match(name_pattern, name) 92 | if not regex: 93 | continue 94 | # Some people have weird names, I did not handle edge cases 95 | parsed_name = "_".join( 96 | [regex.group("first_name"), regex.group("last_name")]) 97 | parsed_name = parsed_name.replace(" ", "_").replace("-", "_") 98 | 99 | write_wrapper(f, parsed_name, path) 100 | 101 | names_and_paths.append((name, path)) 102 | paths.append(path) 103 | f.write("ALL_FRIENDS = %s\n" % str(names_and_paths)) 104 | f.write("ALL_FRIEND_PATHS = %s\n" % str(paths)) 105 | 106 | 107 | def generate_groupchats(): 108 | """ 109 | Use find_groupchat() to get groupchat paths and hardcode them into the GROUPCHATS list 110 | to append them to the end of friends.py 111 | """ 112 | with open("friends.py", "a") as f: 113 | for name, path in GROUPCHATS: 114 | write_wrapper(f, name, path) 115 | 116 | 117 | def generate_name(): 118 | with open("friends.py", "a") as f: 119 | write_wrapper(f, "MY_NAME", my_name) 120 | 121 | 122 | def write_wrapper(f, variable, value): 123 | f.write("%s = \"%s\"\n" % (variable, value)) 124 | 125 | 126 | if __name__ == "__main__": 127 | parser = argparse.ArgumentParser( 128 | description='Configs for setting up data source') 129 | parser.add_argument( 130 | '--dir', help="Path to unzipped messages directory", required=True) 131 | parser.add_argument( 132 | '--name', help="Your name in the format 'John Smith'", required=True) 133 | args = parser.parse_args() 134 | 135 | base_dir = args.dir 136 | my_name = args.name 137 | 138 | generate_friends(50) 139 | generate_groupchats() 140 | generate_name() 141 | -------------------------------------------------------------------------------- /experimental.py: -------------------------------------------------------------------------------- 1 | from friends import MY_NAME 2 | import datetime; import friends; import numpy as np; from collections import defaultdict; from itertools import combinations; from tabulate import tabulate; from helpers import get_json; from private_message_analysis import get_all_stats  # imports needed by the experiments below 3 | # Either these don't work or don't say anything useful 4 | def datetime_from_mtime(mtime):  # assumed helper: ms timestamp -> datetime, same conversion as in helpers.py 5 | return datetime.datetime.fromtimestamp(mtime/1000) 6 | def message_freq(messages, participant): 7 | # After a gap in talking, who initiates first? 
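# For each gap size below, count how often each participant is the one who breaks a silence of at least that length.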
8 | # Obvious problem is conversations can go on hiatus for a couple days 9 | gaps = [.01, 1, 2, 5, 10] 10 | tdelta_gaps = [datetime.timedelta(days=i) for i in gaps] 11 | 12 | print("Gap\tZaibo Count\t%s Count\t%s %%\t %s %%" % 13 | (participant, MY_NAME, participant)) 14 | for gap, gap_in_days in zip(tdelta_gaps, gaps): 15 | prev_msg_t = datetime_from_mtime(messages[-1]["timestamp_ms"]) 16 | counters = { 17 | MY_NAME: 0, 18 | participant: 0 # Other participant. We assume there is only one 19 | } 20 | for message in reversed(messages): 21 | curr_msg_t = datetime_from_mtime(message["timestamp_ms"]) 22 | sender = message["sender_name"] 23 | t_delta = curr_msg_t - prev_msg_t 24 | if t_delta > gap: 25 | counters[sender] += 1 26 | prev_msg_t = curr_msg_t 27 | total_count = sum(counters.values()) + 1 28 | print("%f\t%d\t%d\t%f\t%f" % (gap_in_days, 29 | counters[MY_NAME], 30 | counters[participant], 31 | counters[MY_NAME]/total_count, 32 | counters[participant]/total_count)) 33 | 34 | 35 | def generate_normalization(messages): 36 | counters = {} 37 | for message in messages: 38 | sender = message["sender_name"] 39 | counters[sender] = 1 if sender not in counters else counters[sender] + 1 40 | 41 | 42 | def average_spread(paths=friends.ALL_FRIEND_PATHS): 43 | # Experimental 44 | paths = paths[:20] 45 | stats = ["Characters", "Words", "Messages", "Clusters"] 46 | all_spreads = [] 47 | all_zbo = [] 48 | for path in paths: 49 | message_json = get_json(path) 50 | messages = message_json.get("messages", []) 51 | participant = message_json.get("participants")[0] 52 | data = get_all_stats(messages) 53 | 54 | spreads = [] 55 | zbo = [] 56 | for small_stat, big_stat in combinations(stats, 2): 57 | me = friends.MY_NAME 58 | other = [name for name in data["Characters"]["Month"] 59 | if name != "total" and name != friends.MY_NAME][0] 60 | sender_averages = ( 61 | sum(data[small_stat]["Year"][me].values()) / 62 | sum(data[big_stat]["Year"][me].values()), 63 | sum(data[small_stat]["Year"][other].values()) / 64 | sum(data[big_stat]["Year"][other].values()) 65 | ) 66 | # spread = sender_averages[0]/sender_averages[1] 67 | zbo.append(sender_averages[0]) 68 | spread = sender_averages[0]-sender_averages[1] 69 | spreads.append(spread) 70 | all_spreads.append([other, *spreads]) 71 | all_zbo.append([*zbo]) 72 | inspect = 3 73 | all_spreads.sort(key=lambda x: x[inspect], reverse=True) 74 | # print(tabulate(all_spreads, headers=["Name", *["%s per %s" % combo for combo in combinations(stats, 2)]])) 75 | # bar = plt.hist([x[inspect] for x in all_spreads], 8) 76 | 77 | mean_stdev = [] 78 | combos = ["%s per %s" % x for x in list(combinations(stats, 2))] 79 | zbo_stats = [] 80 | for i in range(1, 7): 81 | avg = np.average([x[i] for x in all_spreads]) 82 | stdev = np.std([x[i] for x in all_spreads]) 83 | mean_stdev.append([combos[i-1], avg, stdev]) 84 | 85 | avg = np.average([x[i-1] for x in all_zbo]) 86 | stdev = np.std([x[i-1] for x in all_zbo]) 87 | zbo_stats.append([combos[i-1], avg, stdev]) 88 | # print("%s avg: %f\tstdev: %f" % (combos[i-1], avg, stdev)) 89 | print(tabulate(mean_stdev, headers=[ 90 | "Zaibo to other ratio", "Average", "STDEV"])) 91 | print("==============================================================") 92 | print(tabulate(zbo_stats, headers=["Zaibo stats", "Average", "STDEV"])) 93 | 94 | 95 | def average_response_time(message_json): 96 | # Confounding data: need to know when a conversation ends 97 | participant = message_json.get("participants")[0] 98 | messages = message_json.get("messages") 99 | 
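# data maps each sender to accumulated response time in seconds and a response count; reported below as minutes per response.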
data = defaultdict(lambda: defaultdict(int)) 100 | 101 | first_message = messages[-1] 102 | prev_msg_t = datetime.datetime.fromtimestamp(first_message["timestamp_ms"]/1000) 103 | prev_sender = first_message["sender_name"] 104 | for message in reversed(messages): 105 | curr_msg_t = datetime.datetime.fromtimestamp(message["timestamp_ms"]/1000) 106 | curr_sender = message["sender_name"] 107 | if curr_sender != prev_sender: 108 | t_delta = curr_msg_t - prev_msg_t 109 | data[curr_sender]["response_time"] += t_delta.total_seconds() 110 | data[curr_sender]["responses"] += 1 111 | prev_msg_t = curr_msg_t 112 | prev_sender = curr_sender 113 | 114 | res = [] 115 | for k, v in data.items(): 116 | if k == friends.MY_NAME: 117 | k = "%s + %s" % (k, participant) 118 | res.append([k, v["response_time"]//60/v["responses"]]) 119 | return res 120 | # for k, v in data.items(): 121 | # print(k, v["response_time"]//60/v["responses"]) 122 | -------------------------------------------------------------------------------- /group_message_analysis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from collections import defaultdict 4 | from pprint import pprint 5 | 6 | import matplotlib.pyplot as plt 7 | from tabulate import tabulate 8 | 9 | from helpers import get_json 10 | from name_hash import NameHasher 11 | import friends 12 | 13 | 14 | ANONYMOUS = False 15 | nh = NameHasher() 16 | 17 | 18 | def main(path): 19 | message_json = get_json(path) 20 | messages = message_json.get("messages", []) 21 | # groupchat_message_stats(messages) 22 | karma_stats(messages) 23 | 24 | 25 | def karma_stats(messages): 26 | """ 27 | Count "++" and "--" karma mentions in a group chat to see who gives karma to whom; 28 | the tallying into `matrix` is currently commented out, so matching messages are just printed 29 | """ 30 | name_map = { 31 | "zaibo": 0, 32 | "zaibo wang": 0, 33 | "rishi": 1, 34 | "rishi tripathy": 1, 35 | "eric": 2, 36 | "eric li": 2, 37 | "jaidev": 3, 38 | "jaidev phadke": 3, 39 | } 40 | # matrix = [["-", "Z", "R", "E", "J"], ["Z", 0, 0, 0, 0], [ 41 | # "R", 0, 0, 0, 0], ["E", 0, 0, 0, 0], ["J", 0, 0, 0, 0]] 42 | matrix = [["-", "Z", "R", "E", "J"], ["Z", (0, 0), (0, 0), (0, 0), (0, 0)], [ 43 | "R", (0, 0), (0, 0), (0, 0), (0, 0)], ["E", (0, 0), (0, 0), (0, 0), (0, 0)], ["J", (0, 0), (0, 0), (0, 0), (0, 0)]] 44 | events = {} 45 | counters = defaultdict(int) 46 | for message in messages: 47 | karma_re = r'(?i)(\-\-|\+\+)' 48 | # karma_re = r'(?i)(jaidev|jaidev phadke|zaibo|zaibo wang|rishi|rishi tripathy|eric|eric li)( ?)(\-\-|\+\+)' 49 | sender = message.get("sender_name", None) 50 | timestamp = message.get("timestamp_ms", None) 51 | content = message.get("content", "") 52 | if content: 53 | regex = re.findall(karma_re, content) 54 | if regex: 55 | print(message) 56 | # for receiver, _, inc in regex: 57 | # receiver_val = name_map[receiver.lower()] 58 | # sender_val = name_map[sender.lower()] 59 | # if receiver_val == 0 and 1 == sender_val: 60 | # print(message) 61 | # if inc == "++": 62 | # pos, neg = matrix[sender_val+1][receiver_val+1] 63 | # matrix[sender_val+1][receiver_val+1] = (pos+1, neg) 64 | # elif inc == "--": 65 | # pos, neg = matrix[sender_val+1][receiver_val+1] 66 | # matrix[sender_val+1][receiver_val+1] = (pos, neg-1) 67 | # print('\n'.join([str(row) for row in matrix])) 68 | # print(''.join('{:5}'.format(x) for x in ["-", "Z", "R", "E", "J"])) 69 | # print('\n'.join([''.join(['{:4}'.format(item) for item in row]) 70 | # for row in matrix[1:]])) 71 | 72 | 73 | def 
groupchat_message_stats(messages): 74 | """ 75 | Creates dictionary of counter + cluster + message + links data of all members of a group chat 76 | and displays data as table and pie chart 77 | 78 | Example 79 | counters = { 80 | "characters": { 81 | "Person 1": 100, 82 | "Person 2": 50 83 | }, 84 | "clusters": { 85 | "Person 1": 100, 86 | "Person 2": 50 87 | }, 88 | "messages": { 89 | "Person 1": 200, 90 | "Person 2": 100 91 | } 92 | } 93 | """ 94 | link_re = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' 95 | counters = defaultdict(lambda: defaultdict(int)) 96 | prev_sender = messages[-1]["sender_name"] 97 | for message in messages: 98 | sender = message["sender_name"] 99 | 100 | if ANONYMOUS: 101 | sender = nh.hash_by_name(sender) 102 | 103 | content = message.get("content", "") 104 | 105 | # Aggregate characters 106 | chars = len(content) 107 | counters["characters"][sender] = chars if not counters["characters"].get( 108 | sender) else counters["characters"].get(sender) + chars 109 | 110 | # Aggregate messages 111 | counters["messages"][sender] = 1 if not counters["messages"].get( 112 | sender) else counters["messages"].get(sender) + 1 113 | 114 | # Aggregate clusters 115 | if sender != prev_sender: 116 | counters["clusters"][sender] = 1 if not counters["clusters"].get( 117 | sender) else counters["clusters"].get(sender) + 1 118 | 119 | # Aggregate links 120 | num_links = len(re.findall(link_re, content)) 121 | counters["links"][sender] += num_links 122 | 123 | prev_sender = sender 124 | 125 | total_clusters = sum([v for k, v in counters["clusters"].items()]) 126 | total_messages = sum([v for k, v in counters["messages"].items()]) 127 | total_characters = sum([v for k, v in counters["characters"].items()]) 128 | total_links = sum([v for k, v in counters["links"].items()]) 129 | 130 | # Assemble data in print_list format to print out as table 131 | print_list = [] 132 | for messages, chars, clusters, links in zip(counters["messages"].items(), 133 | counters["characters"].items(), 134 | counters["clusters"].items(), 135 | counters["links"].items()): 136 | # messages, chars, clusters are a tuple of (, ) 137 | # Ex: ("Zaibo Wang", 500) 138 | name = messages[0] 139 | print_list.append([name, 140 | float("%.3f" % (chars[1]/total_characters)), 141 | float("%.3f" % (messages[1]/total_messages)), 142 | float("%.3f" % (clusters[1]/total_clusters)), 143 | float("%.3f" % (links[1]/total_links))]) 144 | print_list.sort(key=lambda x: x[1], reverse=True) 145 | print(tabulate(print_list, headers=[ 146 | "Name", "% characters", "% messages", "% clusters", "% links"])) 147 | 148 | # Generate pie charts 149 | labels = [x[0] for x in print_list] 150 | characters = [x[1] for x in print_list] 151 | messages = [x[2] for x in print_list] 152 | clusters = [x[3] for x in print_list] 153 | links = [x[4] for x in print_list] 154 | 155 | ax1 = plt.subplot(411) 156 | ax1.pie(characters, labels=labels, autopct='%1.2f%%', 157 | startangle=90) 158 | ax1.axis('equal') 159 | plt.title("Characters", fontsize=20) 160 | 161 | ax1 = plt.subplot(412) 162 | ax1.pie(clusters, labels=labels, autopct='%1.2f%%', 163 | startangle=90) 164 | ax1.axis('equal') 165 | plt.title("Clusters", fontsize=20) 166 | 167 | ax1 = plt.subplot(413) 168 | ax1.pie(messages, labels=labels, autopct='%1.2f%%', 169 | startangle=90) 170 | ax1.axis('equal') 171 | plt.title("Messages", fontsize=20) 172 | 173 | ax1 = plt.subplot(414) 174 | ax1.pie(links, labels=labels, autopct='%1.2f%%', 175 | startangle=90) 176 | 
ax1.axis('equal') 177 | plt.title("Links", fontsize=20) 178 | 179 | plt.show() 180 | 181 | 182 | if __name__ == "__main__": 183 | # path = friends.situation_room 184 | # path = friends.eggplant 185 | path = "./situationroom_7_29_18.json" 186 | main(path) 187 | -------------------------------------------------------------------------------- /name_dump.txt: -------------------------------------------------------------------------------- 1 | Hayden Branch 2 | Erick Vang 3 | Santiago Barnett 4 | Martha Berger 5 | Dane Macdonald 6 | Denise Casey 7 | Mathew Hunter 8 | Ciara David 9 | Kenna York 10 | Ulises Montoya 11 | Evie Strong 12 | Noel Lewis 13 | Elliott Smith 14 | Ryan Dennis 15 | Aubrey Hull 16 | Micah Pierce 17 | Lucia Church 18 | Lauren Riley 19 | Moshe Davies 20 | Brennen Lindsey 21 | Braeden Boyd 22 | Vincent Zamora 23 | Catherine Hanson 24 | Terrance Buchanan 25 | Donte Mccall 26 | Jorge Lucero 27 | Haleigh Moon 28 | Shyanne Hampton 29 | Pranav Dawson 30 | Diego Torres 31 | Jaycee Schwartz 32 | Amirah Sharp 33 | Camilla Wheeler 34 | Kamryn Malone 35 | Chelsea Warren 36 | Maxwell Robinson 37 | Adolfo Christensen 38 | Skylar Woods 39 | Scarlet Holmes 40 | Anderson Brennan 41 | Stephanie Moreno 42 | Shaylee Garrison 43 | Kendal Freeman 44 | Ariel Floyd 45 | Isabella Clarke 46 | Broderick Kelly 47 | Tara Hutchinson 48 | Olive Baird 49 | Kiara Carrillo 50 | Nyasia Noble 51 | Ashly Davis 52 | Nicholas Clay 53 | Dereon Cuevas 54 | Emanuel Cruz 55 | Reagan Leon 56 | Matthias Huang 57 | Tomas Harvey 58 | Cailyn Rhodes 59 | Brenton Carson 60 | Amiyah Bonilla 61 | Sonny Kaufman 62 | Junior Maynard 63 | Cayden Stokes 64 | Brielle Knox 65 | Kadence Cook 66 | Alden Webb 67 | Belen Orozco 68 | Jair Stephens 69 | Sage Pineda 70 | Lennon Richard 71 | Bruce Mcclure 72 | Bo Chapman 73 | Emily Andrews 74 | Audrey Ryan 75 | Zoie Cline 76 | Amber Cummings 77 | Madison Ortega 78 | Myla Cardenas 79 | Penelope Winters 80 | Brooklynn Baxter 81 | Braylon Orr 82 | Hassan Francis 83 | Nikhil Mckay 84 | Amy Barnes 85 | Jackson Ramsey 86 | Denisse Henry 87 | Alena Krueger 88 | Donavan Mendoza 89 | Mathias Sims 90 | Kaila Flores 91 | Nataly Pennington 92 | Keegan Olsen 93 | Cyrus Newman 94 | Marvin Benitez 95 | Kendrick Bradford 96 | Dante Shelton 97 | Rowan Dillon 98 | Raegan Willis 99 | Christian Choi 100 | Rodrigo Conner 101 | Ricky Koch 102 | Leon Phelps 103 | Kiley Potter 104 | Benjamin Simon 105 | Jamya Jensen 106 | Lesly Summers 107 | Charlie Howell 108 | Leslie Cox 109 | Felicity Bond 110 | Agustin Jimenez 111 | Mariyah Weeks 112 | Christina Vazquez 113 | Kaylen Harper 114 | Karsyn Yang 115 | Keenan Bell 116 | Coby Morgan 117 | Adyson Hoover 118 | Ella Delgado 119 | Chaim Gregory 120 | Emilie Norris 121 | Salvador Bass 122 | Guadalupe Watson 123 | Bianca Ruiz 124 | Carissa Berg 125 | Luciana Rubio 126 | Grace Hamilton 127 | Zion Suarez 128 | Alexandria Leblanc 129 | Jamar Hendrix 130 | Yasmine Landry 131 | Kaiya Lester 132 | Amelie Mack 133 | Omar Tran 134 | Kymani Hurst 135 | Abigail Larsen 136 | Jamari Bailey 137 | Andy Andersen 138 | Reina Benson 139 | Allen Blankenship 140 | Camron Brock 141 | Jesse Bentley 142 | Laura Stanton 143 | Camden Conway 144 | Franco Mcfarland 145 | Jeffrey Mccarthy 146 | Cloe Garza 147 | Brynlee Cooper 148 | Jaylah Mata 149 | Demarion Morales 150 | Ishaan Butler 151 | Ty Fuentes 152 | Ezequiel Chen 153 | Savanah Mooney 154 | Makenzie Chung 155 | April Gutierrez 156 | Santos Moore 157 | Abbigail Duarte 158 | Emiliano Pratt 159 | Alma Frey 160 | Cullen Lyons 161 | Reagan Mason 
162 | Skye Sanchez 163 | Christine Sosa 164 | August Salazar 165 | Abagail Shields 166 | Tamara Obrien 167 | Prince Ferguson 168 | Judah Serrano 169 | Alondra Neal 170 | Raina Benjamin 171 | Dennis Hays 172 | Miracle Mullins 173 | Holly Gates 174 | Ean Spence 175 | Jonathon Good 176 | Duncan Bowers 177 | Aaden Acosta 178 | Lana Hartman 179 | Lee Bennett 180 | Kareem Shannon 181 | Spencer Horne 182 | Esperanza Osborne 183 | Rigoberto Scott 184 | Aleah Osborn 185 | Ashlynn Hale 186 | Kenneth Forbes 187 | Rihanna Ball 188 | Morgan Bowen 189 | Koen Alvarado 190 | Abel Snyder 191 | Karina Vargas 192 | Glenn Mathews 193 | Averie Mcintosh 194 | Madelyn Wilkerson 195 | Kaylee Santos 196 | Zackary Booth 197 | Kadin Fuller 198 | Mikaela Bartlett 199 | Markus Fleming 200 | Zayden Schmidt 201 | Reese Ho 202 | Kassidy Cole 203 | Sidney Finley 204 | Braden Guerra 205 | Trystan Ochoa 206 | Leonel Sandoval 207 | Arturo Brady 208 | Sloane Mills 209 | Leyla Bishop 210 | Izabelle Sawyer 211 | Finnegan Kane 212 | Areli Schroeder 213 | Elyse Greene 214 | Yaretzi Chang 215 | Angel Rowe 216 | Ari Frank 217 | Jason Klein 218 | Lila Haas 219 | Jayden Camacho 220 | Ryleigh Herman 221 | Ellen Michael 222 | Hayden Oconnor 223 | Julia Roberts 224 | Lillianna Howe 225 | Sterling Griffin 226 | Andreas Middleton 227 | Fernanda Wilson 228 | Bryce Johns 229 | Branden Barry 230 | Darian Bender 231 | Janet Sheppard 232 | Alexzander Proctor 233 | Isaias Espinoza 234 | Tatiana May 235 | Johnathon Hess 236 | Camryn Livingston 237 | Jakob Sherman 238 | Eliezer Curry 239 | Jorden Villa 240 | Keely Oconnell 241 | Vance Wallace 242 | Brylee Braun 243 | Mollie Shea 244 | Janiya Castro 245 | Tania Jefferson 246 | Daniel Mcconnell 247 | Meredith Ellison 248 | Gauge Bush 249 | Elisa Hines 250 | Talan Martin 251 | Aubree Hickman 252 | Adam Clark 253 | Edith Horton 254 | Austin Mcknight 255 | Anthony Robles 256 | Ximena Baldwin 257 | Jace Carpenter 258 | Gunnar Wade 259 | Alivia Guerrero 260 | Katelynn Wells 261 | Taylor Bernard 262 | Carson Potts 263 | Wesley Goodman 264 | Brenda Savage 265 | Christopher Williams 266 | Ansley Thomas 267 | Nathanial Hinton 268 | Rubi Copeland 269 | Jakobe Owens 270 | Alvin Olson 271 | Kaliyah Coffey 272 | Annabella Dickson 273 | Jean Mitchell 274 | Konner Kirk 275 | Jaiden Mcintyre 276 | India Matthews 277 | Olivia Nguyen 278 | Paula Lamb 279 | Fiona Estrada 280 | Joslyn Poole 281 | Alina Preston 282 | Brayden Blevins 283 | Luna Mcpherson 284 | Eric Prince 285 | Chance Cunningham 286 | Jacqueline Jennings 287 | Cortez Chan 288 | Alisha Cordova 289 | Cordell Joyce 290 | Liana Atkinson 291 | Tyler Lowe 292 | Ainsley Harris 293 | Bernard Ali 294 | Britney Rios 295 | Robert Walker 296 | Ray Booker 297 | Elias Cortez 298 | Korbin Weber 299 | Kolten Miranda 300 | Tanner Mcmillan 301 | Ally Mccormick 302 | Waylon Reese 303 | Desiree Short 304 | Will Graves 305 | Corey Paul 306 | Jaron Bullock 307 | Mariam Sloan 308 | Albert Wiley 309 | Jerry Richmond 310 | Dustin Pace 311 | Kolton Houston 312 | Chana Harmon 313 | Josue Wang 314 | London Gould 315 | Russell Gross 316 | Eduardo Guzman 317 | Evan Watkins 318 | Ariel Mejia 319 | Katrina Gomez 320 | Dominick Martinez 321 | Billy Rowland 322 | Milagros Fletcher 323 | Santino Mercer 324 | Diamond White 325 | Jaelyn Schultz 326 | Zion Hanna 327 | Gwendolyn Giles 328 | Aaliyah Gray 329 | Derek Ewing 330 | Briley Ford 331 | Raven Figueroa 332 | Jadyn Kim 333 | Darnell Rojas 334 | Viviana Ferrell 335 | Jamal Patel 336 | Elena Luna 337 | Reed Navarro 338 | Gerald Bruce 339 
| Caitlyn Becker 340 | Averi Kennedy 341 | Ahmed Sullivan 342 | Chace Jacobson 343 | Kristian Harrison 344 | Marques Pollard 345 | Sammy Downs 346 | Caitlin Hicks 347 | Theodore Tate 348 | Callie Park 349 | Zayne Norman 350 | Enrique Patton 351 | Breanna Stout 352 | Hudson Herrera 353 | Daniela Mcgrath 354 | Sharon Reed 355 | Kaylie Massey 356 | Brycen Munoz 357 | Alan Lang 358 | Kaleb Beck 359 | Easton Gonzalez 360 | Callum Kemp 361 | Clare Mahoney 362 | Sofia Novak 363 | Andrew Lawson 364 | Zaire Trevino 365 | Jennifer Ponce 366 | Yasmin Schaefer 367 | Derrick Pugh 368 | Ana Morrison 369 | Bridger Watts 370 | Angel Flynn 371 | Ashlee Mullen 372 | Ramiro Curtis 373 | Charity English 374 | Nathaly Reeves 375 | Lawson Mercado 376 | Maren Evans 377 | Nathan Wilkinson 378 | Samir Banks 379 | Taylor Ellis 380 | Geovanni Cochran 381 | Micheal Bautista 382 | Emmett Blair 383 | Bradley Chambers 384 | Carsen Dunlap 385 | Jadyn Wiggins 386 | Macey Valenzuela 387 | Jovany Contreras 388 | Sarahi Kline 389 | Tabitha Tapia 390 | Presley Santiago 391 | Lucille Mclaughlin 392 | Amira Shepherd 393 | Alessandra Douglas 394 | Adan Joseph 395 | Briana Monroe 396 | Kenya Lee 397 | Joe Day 398 | Jayleen Mcbride 399 | Anastasia Sanford 400 | Madyson Davidson 401 | Chandler Greer 402 | Donna Calderon 403 | Alani Perry 404 | Marley Lam 405 | Craig Bean 406 | Rebecca King 407 | Jensen Johnson 408 | Adrien Owen 409 | Micah Mays 410 | Cora Everett 411 | Jessie Murillo 412 | Aisha Wagner 413 | Helen Chase 414 | Brodie Hebert 415 | Ramon Ortiz 416 | Walker Myers 417 | Madalyn Ballard 418 | Toby Stevens 419 | Elise Wall 420 | Byron Thornton 421 | Kaden Merritt 422 | Romeo Fischer 423 | Amir Weiss 424 | Rhett Parrish 425 | Parker Valdez 426 | Peter Best 427 | Deacon Ritter 428 | Riley Hayden 429 | Kennedy Leonard 430 | Peyton Charles 431 | Jimmy Maldonado 432 | Akira Tyler 433 | Amaya Franklin 434 | Reginald Kaiser 435 | Mateo Simmons 436 | Gage Anthony 437 | Jane Travis 438 | Semaj Terry 439 | Roselyn Whitaker 440 | Kendall Vega 441 | Matteo Hobbs 442 | Brittany Phillips 443 | Marie Castillo 444 | Dominique Rodgers 445 | Draven Newton 446 | Carlo Andrade 447 | Annalise Key 448 | Desmond Brooks 449 | Paloma Adkins 450 | Lorelei Jackson 451 | Walter Little 452 | Joselyn Roy 453 | Sarai Donaldson 454 | Brooklyn Steele 455 | Allyson Russell 456 | Kianna Cross 457 | Yandel Jenkins 458 | Zain Velez 459 | Donald Parsons 460 | Griffin Sexton 461 | Hadley Melendez 462 | Mohammad Kent 463 | Devyn Salas 464 | Howard Glover 465 | Jacoby Krause 466 | Alonso Nash 467 | Javion Kirby 468 | Wade Cobb 469 | Armando Stephenson 470 | Bridget Lara 471 | Phoenix Mcmahon 472 | Kamren Skinner 473 | Drew Stewart 474 | Kristen Hill 475 | Delilah Allen 476 | Colin Valencia 477 | Antoine Garcia 478 | Rylee Holt 479 | Memphis Cannon 480 | Bruno Edwards 481 | Shaniya Spencer 482 | Jamiya Hawkins 483 | Patricia Rich 484 | Aleena Reid 485 | Lyric Shah 486 | Vivian Duffy 487 | Devin Gay 488 | Karissa Hansen 489 | Halle Gentry 490 | Quintin Schneider 491 | Alberto Cabrera 492 | Journey Hudson 493 | Adalyn Wilkins 494 | Hallie Mckenzie 495 | Julianna Miller 496 | Dakota Humphrey 497 | Lia Carr 498 | Amiya Fernandez 499 | Kamari Collier 500 | Alfred Ray -------------------------------------------------------------------------------- /private_message_analysis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import re 4 | from pprint import pprint 5 | from tabulate import tabulate 6 
| from collections import defaultdict 7 | from itertools import combinations 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | from matplotlib.dates import date2num 12 | 13 | import friends 14 | from name_hash import NameHasher 15 | from helpers import get_json, bucket_datetime, time_format, width_dict 16 | 17 | nh = NameHasher() 18 | ANONYMOUS = False # We can make the data anonymous by hashing all the names except our own 19 | 20 | 21 | def generate_averages(paths=friends.ALL_FRIEND_PATHS): 22 | """ Analyze combinations of stats such as "Characters per Words" across all friends in paths""" 23 | stats = ["Characters", "Words", "Messages", "Clusters"] 24 | average_stats = [] 25 | for path in paths: 26 | message_json = get_json(path) 27 | messages = message_json.get("messages", []) 28 | participant = message_json.get("participants")[0]['name'] 29 | data = get_all_stats(messages) 30 | 31 | for sender in data["Characters"]["Month"]: 32 | if sender == "total": 33 | continue 34 | sender_averages = [] 35 | for small_stat, big_stat in combinations(stats, 2): 36 | sender_averages.append( 37 | sum(data[small_stat]["Year"][sender].values())/sum(data[big_stat]["Year"][sender].values())) 38 | if sender == friends.MY_NAME: 39 | if ANONYMOUS: 40 | sender = "%s + %s" % (friends.MY_NAME, 41 | nh.hash_by_name(participant)) 42 | else: 43 | sender = "%s + %s" % (friends.MY_NAME, participant) 44 | average_stats.append([sender, *sender_averages]) 45 | average_stats.sort(key=lambda x: x[3], reverse=True) 46 | 47 | print(tabulate(average_stats, headers=[ 48 | "Name", *["%s per %s" % combo for combo in combinations(stats, 2)]])) 49 | 50 | 51 | def get_all_stats(messages): 52 | """ 53 | Given 1 on 1 messages, generate stats over periods 54 | 55 | Supported stats: 56 | "Characters": total characters 57 | "Messages": total times enter is pressed 58 | "Clusters": all messages sent before being interrupted by the other participant form one cluster 59 | "Words": naively defined as the number of space-separated tokens (tokens containing ".com" are skipped) 60 | 61 | data is a four layer dictionary 62 | Stat -> Period -> name -> datetime.datetime -> value 63 | data returns a "core data structure" given a Stat and Period key: 64 | { 65 | "name1": { 66 | datetime.datetime: stat_val1 67 | }, 68 | "name2": { 69 | datetime.datetime: stat_val2 70 | }, 71 | "total": { 72 | datetime.datetime: stat_val1+stat_val2 73 | }, 74 | } 75 | Ex: data["Messages"]["Day"] gives daily message counts per sender plus a "total" entry 76 | """ 77 | periods = ["Year", "Month", "Day"] 78 | stats = ["Characters", "Messages", "Clusters", "Words"] 79 | 80 | # Create a four-layered defaultdict with default int leaf 81 | data = defaultdict(lambda: defaultdict( 82 | lambda: defaultdict(lambda: defaultdict(int)))) 83 | 84 | prev_sender = None 85 | for message in reversed(messages): 86 | timestamp = datetime.datetime.fromtimestamp( 87 | message["timestamp_ms"]/1000) 88 | sender_name = message["sender_name"] 89 | if ANONYMOUS and sender_name != friends.MY_NAME: 90 | sender_name = nh.hash_by_name(sender_name) 91 | content = message.get("content", "") 92 | 93 | for period in periods: 94 | m_time = bucket_datetime(timestamp, period) 95 | 96 | # Aggregate for messages, characters, clusters, words 97 | for name in [sender_name, "total"]: 98 | data["Characters"][period][name][m_time] += len(content) 99 | data["Words"][period][name][m_time] += len( 100 | [i for i in content.split(" ") if ".com" not in i]) 101 | # data["Words"][period][name][m_time] += len(content.split(" ")) 102 | 
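# "Messages" increments once per message sent; "Clusters" increments only when the sender changes,
# so a run of consecutive messages from one person counts as a single cluster.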
data["Messages"][period][name][m_time] += 1 103 | if sender_name != prev_sender: 104 | data["Clusters"][period][name][m_time] += 1 105 | 106 | prev_sender = sender_name 107 | 108 | return data 109 | 110 | 111 | def graph_stat(path=friends.BEST_FRIEND, stat="Messages", period="Year", avg=False): 112 | """ 113 | graph_stat wrapper that parses from a path 114 | """ 115 | message_json = get_json(path) 116 | messages = message_json.get("messages", []) 117 | 118 | data = get_all_stats(messages) 119 | _graph_stat(data, stat=stat, period=period, avg=avg) 120 | 121 | 122 | def _graph_stat(data, stat="Messages", period="Month", name="total", message_data=None, avg=False): 123 | """ 124 | The real graph stat function 125 | Graph parameterized stat from get_all_stats 126 | """ 127 | 128 | # Parse data and sort by dates 129 | if not message_data: 130 | message_data = data[stat][period][name] 131 | dates = date2num(list(message_data.keys())) 132 | counts = np.array(list(message_data.values())) 133 | dates, counts = zip(*sorted(zip(dates, counts))) 134 | 135 | if avg: 136 | dates, counts = list(dates), list(counts) 137 | new_counts = counts[:] 138 | for i in range(1, len(counts)-1): 139 | new_counts[i] = counts[i-1] + counts[i] + counts[i+1] 140 | counts = new_counts[1:-1] 141 | dates = dates[1:-1] 142 | 143 | ### BAR GRAPH ### 144 | bar = plt.bar(dates, counts, width=width_dict[period]) 145 | ax = plt.subplot(111) 146 | ax.xaxis_date() 147 | 148 | ### SCATTER PLOT ### 149 | # I think the bar graph displays data better 150 | # scatter = plt.plot_date(dates, counts, '.', label=name) 151 | # p1 = np.poly1d(np.polyfit(dates, counts, 10)) 152 | # best_fit_str = "%s best fit" % name 153 | # best_fit = plt.plot_date(dates, p1(dates), '--', label=best_fit_str) 154 | # plt.autoscale(True) 155 | # plt.grid(True) 156 | # plt.ylim(-100) 157 | # plt.legend() 158 | 159 | plt.ylabel('# of %s' % stat) 160 | plt.title("%s between %s per %s" % (stat, " and ".join( 161 | [i for i in data[stat][period].keys() if i != "total"]), period)) 162 | 163 | 164 | def top_n_stat(n=3, stat="Messages", period="Month", show_counts=False): 165 | """ 166 | Print top n stat'd person per period in a table 167 | """ 168 | res = defaultdict(list) 169 | 170 | for person, path in friends.ALL_FRIENDS: 171 | message_json = get_json(path) 172 | messages = message_json.get("messages", []) 173 | name = message_json.get("participants")[0] 174 | 175 | if ANONYMOUS: 176 | name = nh.hash_by_name(name) 177 | 178 | message_data = get_all_stats(messages)[stat][period]["total"] 179 | 180 | for date, count in message_data.items(): 181 | res[date].append((name, count)) 182 | 183 | # We want to sort by date 184 | res_list = sorted([[date, count_list] for date, count_list in res.items()]) 185 | 186 | table_data = [] 187 | 188 | for date, count_list in res_list[:]: 189 | # Format date by period 190 | date_str = date.strftime(time_format(period)) 191 | # Sort by count 192 | count_list.sort(key=lambda x: x[1], reverse=True) 193 | # Truncate to top n 194 | count_list = count_list[:n] 195 | if show_counts: 196 | name_and_counts = [] 197 | for name, count in count_list: 198 | name = name['name'] 199 | spaces = 30 - len(name) - len(str(count)) 200 | spaces_str = " "*spaces 201 | s = spaces_str.join([name, str(count)]) 202 | name_and_counts.append(s) 203 | table_data.append([date_str, *name_and_counts]) 204 | else: 205 | table_data.append( 206 | [date_str, *[name for name, count in count_list]]) 207 | print("Top %d Most %s per %s" % (n, stat, period)) 208 | 
print(tabulate(table_data, headers=[ 209 | period, *["#%d" % i for i in range(1, n+1)]])) 210 | 211 | # Attempt to use matplotlib for tables... ASCII seems better 212 | # TODO better table? 213 | # fig, ax = plt.subplots() 214 | # fig.patch.set_visible(False) 215 | # ax.axis('off') 216 | # ax.axis('tight') 217 | # col_labels = ["Month", *["#%d" % i for i in range(1, n+1)]] 218 | # table = plt.table(cellText=table_data, colWidths=[0.1] * (n+1), loc='center', colLabels=col_labels) 219 | 220 | # # Center the month column 221 | # cells = table.properties()["celld"] 222 | # for i in range(len(cells)//(n+1)): 223 | # cells[i, 0]._loc = 'center' 224 | # # Format table 225 | # table.set_fontsize(24) 226 | # table.scale(1.3, 1.1) 227 | 228 | # plt.title("Top %d %s Sent by per %s" % (n, stat, period)) 229 | 230 | 231 | def total_stat_sent(stat="Messages", period="Year"): 232 | """ 233 | Graph the total of a stat sent by YOU, summed over all friends 234 | """ 235 | res = defaultdict(int) 236 | 237 | for person, path in friends.ALL_FRIENDS: 238 | message_json = get_json(path) 239 | messages = message_json.get("messages", []) 240 | name = message_json.get("participants")[0] 241 | 242 | data = get_all_stats(messages) 243 | message_data = data[stat][period][friends.MY_NAME] 244 | 245 | for date, count in message_data.items(): 246 | res[date] += count 247 | 248 | res_list = sorted([(date, count) for date, count in res.items()]) 249 | dates = [elem[0] for elem in res_list[:-1]] 250 | counts = [elem[1] for elem in res_list[:-1]] 251 | 252 | bar = plt.bar(dates, counts, width=width_dict[period]) 253 | ax = plt.subplot(111) 254 | ax.xaxis_date() 255 | plt.ylabel('# of %s' % stat) 256 | plt.title("Total %s Sent %s per %s" % (stat, friends.MY_NAME, period)) 257 | 258 | 259 | def count_specific_words(WORDS, path=friends.BEST_FRIEND): 260 | """ 261 | Count the frequency of each word in WORDS between you and the friend at path 262 | Should we normalize by message count per year? 
263 | """ 264 | counters = defaultdict(lambda: defaultdict(int)) 265 | 266 | message_json = get_json(path) 267 | messages = message_json.get("messages", []) 268 | 269 | for keyword in WORDS: 270 | for message in messages: 271 | sender = message["sender_name"] 272 | if ANONYMOUS and sender != friends.MY_NAME: 273 | sender = nh.hash_by_name(sender) 274 | content = message.get("content", "") 275 | 276 | 277 | count = content.lower().count(keyword) 278 | counters[keyword][sender] += count 279 | table = [] 280 | for keyword, participants in counters.items(): 281 | table.append([keyword, *participants.values()]) 282 | print(tabulate(table, headers=["Word", *participants.keys()])) 283 | 284 | 285 | def count_links(paths=friends.ALL_FRIEND_PATHS[:20]): 286 | """ 287 | Count links sent between you and each friend 288 | """ 289 | table = [] 290 | link_re = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' 291 | for path in paths: 292 | message_json = get_json(path) 293 | messages = message_json.get("messages", []) 294 | participant = message_json.get("participants")[0]['name'] 295 | counters = defaultdict(int) 296 | 297 | for message in messages: 298 | sender = message["sender_name"] 299 | content = message.get("content", "") 300 | num_links = len(re.findall(link_re, content)) 301 | counters[sender] += num_links 302 | 303 | table.append([ 304 | participant, 305 | counters[friends.MY_NAME]/max(counters[participant], 1), # avoid division by zero if the friend never sent a link 306 | counters[friends.MY_NAME], 307 | counters[participant], 308 | counters[friends.MY_NAME] + counters[participant]]) 309 | table.sort(key=lambda x: x[1], reverse=True) 310 | 311 | if ANONYMOUS: 312 | for row in table: 313 | row[0] = nh.hash_by_name(row[0]) 314 | 315 | print(tabulate(table, headers=[ 316 | "Name", "Ratio of Links", "Sent by me", "Sent by other", "Total"])) 317 | avg = np.average([x[1] for x in table if x[2] > 50]) 318 | stdev = np.std([x[1] for x in table]) 319 | print("Average Ratio: %f" % avg) 320 | print("Ratio STDEV: %f" % stdev) 321 | 322 | 323 | if __name__ == "__main__": 324 | """ 325 | supported periods: "Month", "Year", "Day" 326 | 327 | Supported stats: 328 | "Characters": total characters 329 | "Messages": total times enter is pressed 330 | "Clusters": all messages sent before being interrupted by the other participant form one cluster 331 | "Words": naively defined as the number of space-separated tokens 332 | """ 333 | graph_stat(friends.JAIDEV_PHADKE, stat="Characters", 334 | period="Month", avg=True) 335 | # top_n_stat(n=4, stat="Characters", period="Month", show_counts=True) 336 | # count_links(friends.ALL_FRIEND_PATHS[:20]) 337 | # generate_averages(friends.ALL_FRIEND_PATHS) 338 | # words = ["lol", "lool", "loool", "lmao", "haha", "hahaha", "hahahaha"] 339 | # count_specific_words(words, friends.BEST_FRIEND) 340 | # total_stat_sent(stat="Words", period="Year") 341 | 342 | plt.show(block=True) 343 | --------------------------------------------------------------------------------
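For reference, the `friends.py` generated by `setup.py` is expected to look roughly like this (a sketch based on `write_wrapper`, `generate_friends`, and `generate_name`; the names and paths are illustrative):

```
BEST_FRIEND = "data/johnsmith_a1b2c3/message.json"
JOHN_SMITH = "data/johnsmith_a1b2c3/message.json"
ALL_FRIENDS = [('JOHN SMITH', 'data/johnsmith_a1b2c3/message.json')]
ALL_FRIEND_PATHS = ['data/johnsmith_a1b2c3/message.json']
MY_NAME = "Zaibo Wang"
```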