├── __init__.py ├── .gitignore ├── name_hash.py ├── helpers.py ├── README.md ├── setup.py ├── experimental.py ├── group_message_analysis.py ├── name_dump.txt └── private_message_analysis.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | config.py 2 | data/ 3 | data*/ 4 | __pycache__ 5 | friends.py 6 | most_messaged_per_month.txt 7 | message_dump.txt 8 | temp.py 9 | .vscode 10 | *.out 11 | *.json -------------------------------------------------------------------------------- /name_hash.py: -------------------------------------------------------------------------------- 1 | class NameHasher(): 2 | def __init__(self): 3 | with open("name_dump.txt", "r") as f: 4 | self.anon_names = [name.strip() for name in f.readlines()] 5 | 6 | def hash_by_index(self, index): 7 | """Given an index, return a name""" 8 | bounded_index = index % len(self.anon_names) 9 | return self.anon_names[bounded_index] 10 | 11 | def hash_by_name(self, name_in): 12 | """Hash an input name to an anonymous name""" 13 | index = hash(name_in) 14 | return self.hash_by_index(index) -------------------------------------------------------------------------------- /helpers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import datetime 3 | from collections import defaultdict 4 | 5 | 6 | def get_json(path): 7 | with open(path, "r") as f: 8 | return json.loads(f.read()) 9 | 10 | 11 | def check_participants(message_json): 12 | """Check that a thread is a 1 on 1 conversation (exactly two participants)""" 13 | return len(message_json.get("participants", [])) == 2 14 | # return len(message_json.get("participants", [])) == 1 15 | 16 | 17 | def bucket_datetime(timestamp, period="Month"): 18 | """ 19 | We aggregate data such as message counts by casting each timestamp to a datetime bucket 20 | 21 | For example, if we want data by month, an event that happens on August 5th, 1998 22 | will be cast to the August-1998 bucket, which allows us to treat all events 23 | in August 1998 the same 24 | """ 25 | if period == "Day": 26 | return datetime.datetime(year=timestamp.year, month=timestamp.month, day=timestamp.day) 27 | elif period == "Month": 28 | return datetime.datetime(year=timestamp.year, month=timestamp.month, day=1) 29 | elif period == "Year": 30 | return datetime.datetime(year=timestamp.year, month=1, day=1) 31 | raise Exception("Unsupported period: %s" % period) 32 | 33 | 34 | def count_messages(messages): 35 | counters = defaultdict(int) 36 | participants = set() 37 | for message in messages: 38 | sender = message.get("sender_name", "") 39 | participants.add(sender) 40 | counters[sender] += 1 41 | return sum(counters.values()) if len(participants) == 2 else 0 42 | 43 | 44 | def time_format(period): 45 | """strftime formatting""" 46 | if period == "Day": 47 | return "%m-%d-%Y" 48 | elif period == "Month": 49 | return "%m-%Y" 50 | elif period == "Year": 51 | return "%Y" 52 | 53 | 
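# Usage sketch for the two helpers above: bucket_datetime(datetime.datetime(1998, 8, 5, 14, 30), period="Month")
# returns datetime.datetime(1998, 8, 1, 0, 0), and formatting that bucket with time_format("Month") ("%m-%Y")
# yields "08-1998".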
54 | def message_dump(messages, period="Month"): 55 | """ 56 | Dump messages from a specific time period to message_dump.txt 57 | """ 58 | for message in reversed(messages): 59 | participant = message["sender_name"] 60 | 61 | # Grab timestamp from message and cast it to a month + year timestamp 62 | timestamp = datetime.datetime.fromtimestamp( 63 | message["timestamp_ms"]/1000) 64 | m_time = bucket_datetime(timestamp, period=period) 65 | 66 | # We use this to get all messages from a certain month 67 | TARGET = datetime.datetime(year=2017, month=10, day=1) 68 | if TARGET == m_time: 69 | with open("message_dump.txt", 'a') as f: 70 | f.write(participant + ": " + message.get("content", "") + "\n") 71 | 72 | 73 | width_dict = { 74 | "Year": 200, 75 | "Month": 35, 76 | "Day": 8 77 | } 78 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Facebook Message Analysis 2 | 3 | Analyze Facebook messages for interesting statistics 4 | 5 | ## Prerequisites 6 | 7 | ``` 8 | python3 and these libraries: 9 | pip install matplotlib 10 | pip install numpy 11 | pip install tabulate 12 | ``` 13 | 14 | ## Getting Started 15 | Clone the repo 16 | ``` 17 | git clone https://github.com/Strafos/fb_messenger_analysis 18 | ``` 19 | Follow instructions [here](https://www.facebook.com/help/1701730696756992?helpref=hc_global_nav) to download your Facebook message data. 20 | 21 | Make sure to select "JSON" for the format. The media quality doesn't matter since we are only looking at text. 22 | 23 | After Facebook processes the request (it could take up to a couple of hours), unzip the data and put it in the repository. 24 | 25 | ## Setup 26 | 27 | `setup.py` will generate `friends.py`, which contains paths to the relevant message dumps. 28 | 29 | `setup.py` requires two arguments: 30 | 31 | `--dir`, the directory which has the unzipped Facebook message data, and 32 | 33 | `--name`, which should be your Facebook name in the format "John Smith" 34 | 35 | My setup looks like this 36 | 37 | ``` 38 | python setup.py --dir data --name "Zaibo Wang" 39 | ``` 40 | 41 | because my message data directory is inside the repository. 42 | 43 | `friends.py` will list out the top 50 (by default) most messaged friends in order. 44 | 45 | ## Examples 46 | 47 | ### Private messages 48 | 49 | `private_message_analysis.py` analyzes 1 on 1 messages. All methods are called from the main block and commented out by default. Generally, four statistics are supported: 50 | 51 | * Characters: total characters 52 | * Messages: total times enter is pressed 53 | * Clusters: all messages sent before being interrupted by the other participant 54 | * Words: count of elements split by spaces 55 | 56 | The supported time periods are Year, Month, and Day. 57 | 58 | All friends are initialized in `friends.py`. To access a friend in `private_message_analysis.py`, use the variable `friends.JOHN_SMITH` 59 | 60 | I used a name hash in the following example outputs so they don't show friends' real names 61 | 62 | --- 63 | `graph_stat` will create a bar graph of a given stat over a period. By default, it graphs Messages per Year between you and your best friend (most messaged friend). 64 | 65 | ![graph_messages](https://i.imgur.com/B6yaCSU.png) 66 | ![graph_characters](https://i.imgur.com/Qo9s2TY.png) 67 | --- 68 | `top_n_stat` shows the top n people for a given stat in each period. By default, it is set to the top 4 by characters per month (I think this statistic is the most interesting) 69 | 70 | ![n_top_stat](https://i.imgur.com/YHSfP6I.png) 71 | --- 72 | `count_links` gives an absolute count and ratio of links sent to/received from a person. By default, it only calculates this for the top 20 most messaged friends (I find that after 20, there are few links and the data is not useful) 73 | 74 | ![count_links](https://i.imgur.com/nzQvhG4.png) 75 | 76 | 
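Each of these examples is produced by uncommenting the corresponding call in the main block of `private_message_analysis.py`. For instance, a minimal main block might look like this (a sketch; `friends.JOHN_SMITH` is the placeholder variable described above):

```
if __name__ == "__main__":
    graph_stat(friends.JOHN_SMITH, stat="Characters", period="Month")
    top_n_stat(n=4, stat="Characters", period="Month", show_counts=True)
    plt.show(block=True)
```
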
--- 77 | `generate_averages` takes combinations of the aforementioned stats (such as Characters per Message) and calculates the average per person over all (top 50) friends. 78 | 79 | ![generate_averages](https://i.imgur.com/NJ2OPnt.png) 80 | 81 | --- 82 | 83 | `count_specific_words` takes a list of words and a friend and compares word frequencies. 84 | 85 | ![count_specific_words](https://i.imgur.com/NoZHPsQ.png) 86 | --- 87 | `total_stat_sent` shows how much of a certain stat you have sent over a period. The default is total Messages per Year. 88 | 89 | ![total_stat_send](https://i.imgur.com/vt9MYvF.png) 90 | 91 | ### Group Messages 92 | `group_message_analysis.py` has the code to analyze group messages. It is a little trickier to set up. The easiest way to run it is to pass the path of a group chat's message.json to the main method in `group_message_analysis.py`. 93 | 94 | Group chats can be hard to locate within the message dump, so another way is to use `find_groupchat()` in `setup.py`. This lets me specify a condition such as all group chats with more than 15 participants. I then add them to the GROUPCHATS list in `setup.py`, which writes the group chat paths into `friends.py`. These paths can then be passed into the main method as `friends.${chat_name}`. 95 | 96 | Result: 97 | ![group_chat](https://i.imgur.com/xzLZC60.png) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import argparse 4 | import glob 5 | from pprint import pprint 6 | 7 | from helpers import get_json, count_messages, check_participants 8 | 9 | my_name = None 10 | 11 | """ 12 | This file generates friends.py which is needed for all data analysis 13 | """ 14 | 15 | # To look at groupchats, use find_groupchat() in setup.py 16 | # by adding your conditions to narrow down the search 17 | # Then, add them to the GROUPCHATS list 18 | GROUPCHATS = [ 19 | # Format is a tuple (name, path): 20 | # ("situation_room", "/home/zaibo/code/fb_analysis/data/thesituationroom_69ae5d10b1/message.json"), 21 | # ("eggplant", "/home/zaibo/code/fb_analysis/data/96a68cd96d/message.json") 22 | ] 23 | 24 | 25 | def find_groupchat(): 26 | """ 27 | generate_friends() will not pick up group chats, so we must find them manually 28 | We can set up conditions to narrow down the chats (ex: find all groupchats with 15+ people) 29 | """ 30 | all_paths = [] 31 | for dir in os.listdir(base_dir): 32 | inner_dir = base_dir + "/" + dir 33 | for filename in os.listdir(inner_dir): 34 | if filename == "message.json": 35 | filepath = inner_dir + "/" + filename 36 | all_paths.append(filepath) 37 | 38 | for path in all_paths: 39 | message_json = get_json(path) 40 | party = message_json.get("participants", []) 41 | # Make some condition to look for group chats, this one is 15+ participants 42 | if len(party) > 15: 43 | print(path) 44 | 45 | 
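# For reference, each message.json is assumed to look roughly like this (only the fields
# these scripts read; the real export contains more fields):
# {
#     "participants": [{"name": "Zaibo Wang"}, {"name": "John Smith"}],
#     "messages": [
#         {"sender_name": "John Smith", "timestamp_ms": 1507082400000, "content": "hello"},
#         ...
#     ]
# }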
46 | def generate_friends(n=50): 47 | """ 48 | Generate friends.py which is used by most of the other scripts 49 | friends.py will contain paths to the top n most frequently messaged friends 50 | """ 51 | all_paths = [] 52 | for dir in os.listdir(base_dir): 53 | if dir.startswith("."): # Macs have a .DS_STORE file which throws an exception 54 | continue 55 | inner_dir = base_dir + "/" + dir 56 | for filename in os.listdir(inner_dir): 57 | if filename == "message.json": 58 | filepath = inner_dir + "/" + filename 59 | all_paths.append(filepath) 60 | 61 | # Each element is a tuple of (friend_name, total_messages, path) 62 | messages_per_friend = [] 63 | 64 | for path in all_paths: 65 | message_json = get_json(path) 66 | print(path) 67 | if check_participants(message_json): 68 | messages = message_json.get("messages", []) 69 | participant = message_json.get("participants") 70 | participant = [i for i in participant if i['name'] != my_name] 71 | if len(participant) != 1: 72 | continue 73 | participant = participant[0]['name'] 74 | total_messages = count_messages(messages) 75 | if total_messages != 0: 76 | messages_per_friend.append((participant, total_messages, path)) 77 | messages_per_friend.sort(key=lambda x: x[1], reverse=True) 78 | 79 | # People have weird names, this regex can break... 80 | name_pattern = "(?P<first_name>([A-Z]|-)*) (?P<last_name>([A-Z]|-)*)" 81 | with open("friends.py", "w") as f: 82 | # Create a "BEST_FRIEND" which will be the default path 83 | # BEST_FRIEND is the most messaged friend 84 | _, _, path = messages_per_friend[0] 85 | write_wrapper(f, "BEST_FRIEND", path) 86 | 87 | names_and_paths = [] 88 | paths = [] 89 | for name, _, path in messages_per_friend[:n]: 90 | name = name.upper() 91 | regex = re.match(name_pattern, name) 92 | if not regex: 93 | continue 94 | # Some people have weird names, I did not handle edge cases 95 | parsed_name = "_".join( 96 | [regex.group("first_name"), regex.group("last_name")]) 97 | parsed_name = parsed_name.replace(" ", "_").replace("-", "_") 98 | 99 | write_wrapper(f, parsed_name, path) 100 | 101 | names_and_paths.append((name, path)) 102 | paths.append(path) 103 | f.write("ALL_FRIENDS = %s\n" % str(names_and_paths)) 104 | f.write("ALL_FRIEND_PATHS = %s\n" % str(paths)) 105 | 106 | 107 | def generate_groupchats(): 108 | """ 109 | Use find_groupchat() to get groupchat paths and hardcode them into the GROUPCHATS list 110 | to append them to the end of friends.py 111 | """ 112 | with open("friends.py", "a") as f: 113 | for name, path in GROUPCHATS: 114 | write_wrapper(f, name, path) 115 | 116 | 117 | def generate_name(): 118 | with open("friends.py", "a") as f: 119 | write_wrapper(f, "MY_NAME", my_name) 120 | 121 | 122 | def write_wrapper(f, variable, value): 123 | f.write("%s = \"%s\"\n" % (variable, value)) 124 | 125 | 126 | if __name__ == "__main__": 127 | parser = argparse.ArgumentParser( 128 | description='Configs for setting up data source') 129 | parser.add_argument( 130 | '--dir', help="Path to unzipped messages directory", required=True) 131 | parser.add_argument( 132 | '--name', help="Your name in the format 'John Smith'", required=True) 133 | args = parser.parse_args() 134 | 135 | base_dir = args.dir 136 | my_name = args.name 137 | 138 | generate_friends(50) 139 | generate_groupchats() 140 | generate_name() 141 | -------------------------------------------------------------------------------- /experimental.py: -------------------------------------------------------------------------------- 1 | from friends import MY_NAME 2 | import datetime; import friends; import numpy as np; from collections import defaultdict; from itertools import combinations; from tabulate import tabulate; from helpers import get_json; from private_message_analysis import get_all_stats  # imports needed by the experiments below 3 | # Either these don't work or don't say anything useful 4 | def datetime_from_mtime(mtime):  # assumed helper: ms timestamp -> datetime, same conversion as in helpers.py 5 | return datetime.datetime.fromtimestamp(mtime/1000) 6 | def message_freq(messages, participant): 7 | # After a gap in talking, who initiates first? 
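# For each gap size below, count how often each participant is the one who breaks a silence of at least that length.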
8 | # Obvious problem is conversations can go on hiatus for a couple days 9 | gaps = [.01, 1, 2, 5, 10] 10 | tdelta_gaps = [datetime.timedelta(days=i) for i in gaps] 11 | 12 | print("Gap\tZaibo Count\t%s Count\t%s %%\t %s %%" % 13 | (participant, MY_NAME, participant)) 14 | for gap, gap_in_days in zip(tdelta_gaps, gaps): 15 | prev_msg_t = datetime_from_mtime(messages[-1]["timestamp_ms"]) 16 | counters = { 17 | MY_NAME: 0, 18 | participant: 0 # Other participant. We assume there is only one 19 | } 20 | for message in reversed(messages): 21 | curr_msg_t = datetime_from_mtime(message["timestamp_ms"]) 22 | sender = message["sender_name"] 23 | t_delta = curr_msg_t - prev_msg_t 24 | if t_delta > gap: 25 | counters[sender] += 1 26 | prev_msg_t = curr_msg_t 27 | total_count = sum(counters.values()) + 1 28 | print("%f\t%d\t%d\t%f\t%f" % (gap_in_days, 29 | counters[MY_NAME], 30 | counters[participant], 31 | counters[MY_NAME]/total_count, 32 | counters[participant]/total_count)) 33 | 34 | 35 | def generate_normalization(messages): 36 | counters = {} 37 | for message in messages: 38 | sender = message["sender_name"] 39 | counters[sender] = 1 if sender not in counters else counters[sender] + 1 40 | 41 | 42 | def average_spread(paths=friends.ALL_FRIEND_PATHS): 43 | # Experimental 44 | paths = paths[:20] 45 | stats = ["Characters", "Words", "Messages", "Clusters"] 46 | all_spreads = [] 47 | all_zbo = [] 48 | for path in paths: 49 | message_json = get_json(path) 50 | messages = message_json.get("messages", []) 51 | participant = message_json.get("participants")[0] 52 | data = get_all_stats(messages) 53 | 54 | spreads = [] 55 | zbo = [] 56 | for small_stat, big_stat in combinations(stats, 2): 57 | me = friends.MY_NAME 58 | other = [name for name in data["Characters"]["Month"] 59 | if name != "total" and name != friends.MY_NAME][0] 60 | sender_averages = ( 61 | sum(data[small_stat]["Year"][me].values()) / 62 | sum(data[big_stat]["Year"][me].values()), 63 | sum(data[small_stat]["Year"][other].values()) / 64 | sum(data[big_stat]["Year"][other].values()) 65 | ) 66 | # spread = sender_averages[0]/sender_averages[1] 67 | zbo.append(sender_averages[0]) 68 | spread = sender_averages[0]-sender_averages[1] 69 | spreads.append(spread) 70 | all_spreads.append([other, *spreads]) 71 | all_zbo.append([*zbo]) 72 | inspect = 3 73 | all_spreads.sort(key=lambda x: x[inspect], reverse=True) 74 | # print(tabulate(all_spreads, headers=["Name", *["%s per %s" % combo for combo in combinations(stats, 2)]])) 75 | # bar = plt.hist([x[inspect] for x in all_spreads], 8) 76 | 77 | mean_stdev = [] 78 | combos = ["%s per %s" % x for x in list(combinations(stats, 2))] 79 | zbo_stats = [] 80 | for i in range(1, 7): 81 | avg = np.average([x[i] for x in all_spreads]) 82 | stdev = np.std([x[i] for x in all_spreads]) 83 | mean_stdev.append([combos[i-1], avg, stdev]) 84 | 85 | avg = np.average([x[i-1] for x in all_zbo]) 86 | stdev = np.std([x[i-1] for x in all_zbo]) 87 | zbo_stats.append([combos[i-1], avg, stdev]) 88 | # print("%s avg: %f\tstdev: %f" % (combos[i-1], avg, stdev)) 89 | print(tabulate(mean_stdev, headers=[ 90 | "Zaibo to other ratio", "Average", "STDEV"])) 91 | print("==============================================================") 92 | print(tabulate(zbo_stats, headers=["Zaibo stats", "Average", "STDEV"])) 93 | 94 | 95 | def average_response_time(message_json): 96 | # Confounding data: need to know when a conversation ends 97 | participant = message_json.get("participants")[0] 98 | messages = message_json.get("messages") 99 | 
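# data maps each sender to accumulated response time in seconds and a response count; reported below as minutes per response.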
data = defaultdict(lambda: defaultdict(int)) 100 | 101 | first_message = messages[-1] 102 | prev_msg_t = datetime.datetime.fromtimestamp(first_message["timestamp_ms"]/1000) 103 | prev_sender = first_message["sender_name"] 104 | for message in reversed(messages): 105 | curr_msg_t = datetime.datetime.fromtimestamp(message["timestamp_ms"]/1000) 106 | curr_sender = message["sender_name"] 107 | if curr_sender != prev_sender: 108 | t_delta = curr_msg_t - prev_msg_t 109 | data[curr_sender]["response_time"] += t_delta.total_seconds() 110 | data[curr_sender]["responses"] += 1 111 | prev_msg_t = curr_msg_t 112 | prev_sender = curr_sender 113 | 114 | res = [] 115 | for k, v in data.items(): 116 | if k == friends.MY_NAME: 117 | k = "%s + %s" % (k, participant) 118 | res.append([k, v["response_time"]//60/v["responses"]]) 119 | return res 120 | # for k, v in data.items(): 121 | # print(k, v["response_time"]//60/v["responses"]) 122 | -------------------------------------------------------------------------------- /group_message_analysis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from collections import defaultdict 4 | from pprint import pprint 5 | 6 | import matplotlib.pyplot as plt 7 | from tabulate import tabulate 8 | 9 | from helpers import get_json 10 | from name_hash import NameHasher 11 | import friends 12 | 13 | 14 | ANONYMOUS = False 15 | nh = NameHasher() 16 | 17 | 18 | def main(path): 19 | message_json = get_json(path) 20 | messages = message_json.get("messages", []) 21 | # groupchat_message_stats(messages) 22 | karma_stats(messages) 23 | 24 | 25 | def karma_stats(messages): 26 | """ 27 | Count "++" and "--" karma mentions in a group chat to see who gives karma to whom; 28 | the tallying into `matrix` is currently commented out, so matching messages are just printed 29 | """ 30 | name_map = { 31 | "zaibo": 0, 32 | "zaibo wang": 0, 33 | "rishi": 1, 34 | "rishi tripathy": 1, 35 | "eric": 2, 36 | "eric li": 2, 37 | "jaidev": 3, 38 | "jaidev phadke": 3, 39 | } 40 | # matrix = [["-", "Z", "R", "E", "J"], ["Z", 0, 0, 0, 0], [ 41 | # "R", 0, 0, 0, 0], ["E", 0, 0, 0, 0], ["J", 0, 0, 0, 0]] 42 | matrix = [["-", "Z", "R", "E", "J"], ["Z", (0, 0), (0, 0), (0, 0), (0, 0)], [ 43 | "R", (0, 0), (0, 0), (0, 0), (0, 0)], ["E", (0, 0), (0, 0), (0, 0), (0, 0)], ["J", (0, 0), (0, 0), (0, 0), (0, 0)]] 44 | events = {} 45 | counters = defaultdict(int) 46 | for message in messages: 47 | karma_re = r'(?i)(\-\-|\+\+)' 48 | # karma_re = r'(?i)(jaidev|jaidev phadke|zaibo|zaibo wang|rishi|rishi tripathy|eric|eric li)( ?)(\-\-|\+\+)' 49 | sender = message.get("sender_name", None) 50 | timestamp = message.get("timestamp_ms", None) 51 | content = message.get("content", "") 52 | if content: 53 | regex = re.findall(karma_re, content) 54 | if regex: 55 | print(message) 56 | # for receiver, _, inc in regex: 57 | # receiver_val = name_map[receiver.lower()] 58 | # sender_val = name_map[sender.lower()] 59 | # if receiver_val == 0 and 1 == sender_val: 60 | # print(message) 61 | # if inc == "++": 62 | # pos, neg = matrix[sender_val+1][receiver_val+1] 63 | # matrix[sender_val+1][receiver_val+1] = (pos+1, neg) 64 | # elif inc == "--": 65 | # pos, neg = matrix[sender_val+1][receiver_val+1] 66 | # matrix[sender_val+1][receiver_val+1] = (pos, neg-1) 67 | # print('\n'.join([str(row) for row in matrix])) 68 | # print(''.join('{:5}'.format(x) for x in ["-", "Z", "R", "E", "J"])) 69 | # print('\n'.join([''.join(['{:4}'.format(item) for item in row]) 70 | # for row in matrix[1:]])) 71 | 72 | 73 | def 
groupchat_message_stats(messages): 74 | """ 75 | Creates dictionary of counter + cluster + message + links data of all members of a group chat 76 | and displays data as table and pie chart 77 | 78 | Example 79 | counters = { 80 | "characters": { 81 | "Person 1": 100, 82 | "Person 2": 50 83 | }, 84 | "clusters": { 85 | "Person 1": 100, 86 | "Person 2": 50 87 | }, 88 | "messages": { 89 | "Person 1": 200, 90 | "Person 2": 100 91 | } 92 | } 93 | """ 94 | link_re = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' 95 | counters = defaultdict(lambda: defaultdict(int)) 96 | prev_sender = messages[-1]["sender_name"] 97 | for message in messages: 98 | sender = message["sender_name"] 99 | 100 | if ANONYMOUS: 101 | sender = nh.hash_by_name(sender) 102 | 103 | content = message.get("content", "") 104 | 105 | # Aggregate characters 106 | chars = len(content) 107 | counters["characters"][sender] = chars if not counters["characters"].get( 108 | sender) else counters["characters"].get(sender) + chars 109 | 110 | # Aggregate messages 111 | counters["messages"][sender] = 1 if not counters["messages"].get( 112 | sender) else counters["messages"].get(sender) + 1 113 | 114 | # Aggregate clusters 115 | if sender != prev_sender: 116 | counters["clusters"][sender] = 1 if not counters["clusters"].get( 117 | sender) else counters["clusters"].get(sender) + 1 118 | 119 | # Aggregate links 120 | num_links = len(re.findall(link_re, content)) 121 | counters["links"][sender] += num_links 122 | 123 | prev_sender = sender 124 | 125 | total_clusters = sum([v for k, v in counters["clusters"].items()]) 126 | total_messages = sum([v for k, v in counters["messages"].items()]) 127 | total_characters = sum([v for k, v in counters["characters"].items()]) 128 | total_links = sum([v for k, v in counters["links"].items()]) 129 | 130 | # Assemble data in print_list format to print out as table 131 | print_list = [] 132 | for messages, chars, clusters, links in zip(counters["messages"].items(), 133 | counters["characters"].items(), 134 | counters["clusters"].items(), 135 | counters["links"].items()): 136 | # messages, chars, clusters are a tuple of (, ) 137 | # Ex: ("Zaibo Wang", 500) 138 | name = messages[0] 139 | print_list.append([name, 140 | float("%.3f" % (chars[1]/total_characters)), 141 | float("%.3f" % (messages[1]/total_messages)), 142 | float("%.3f" % (clusters[1]/total_clusters)), 143 | float("%.3f" % (links[1]/total_links))]) 144 | print_list.sort(key=lambda x: x[1], reverse=True) 145 | print(tabulate(print_list, headers=[ 146 | "Name", "% characters", "% messages", "% clusters", "% links"])) 147 | 148 | # Generate pie charts 149 | labels = [x[0] for x in print_list] 150 | characters = [x[1] for x in print_list] 151 | messages = [x[2] for x in print_list] 152 | clusters = [x[3] for x in print_list] 153 | links = [x[4] for x in print_list] 154 | 155 | ax1 = plt.subplot(411) 156 | ax1.pie(characters, labels=labels, autopct='%1.2f%%', 157 | startangle=90) 158 | ax1.axis('equal') 159 | plt.title("Characters", fontsize=20) 160 | 161 | ax1 = plt.subplot(412) 162 | ax1.pie(clusters, labels=labels, autopct='%1.2f%%', 163 | startangle=90) 164 | ax1.axis('equal') 165 | plt.title("Clusters", fontsize=20) 166 | 167 | ax1 = plt.subplot(413) 168 | ax1.pie(messages, labels=labels, autopct='%1.2f%%', 169 | startangle=90) 170 | ax1.axis('equal') 171 | plt.title("Messages", fontsize=20) 172 | 173 | ax1 = plt.subplot(414) 174 | ax1.pie(links, labels=labels, autopct='%1.2f%%', 175 | startangle=90) 176 | 
ax1.axis('equal') 177 | plt.title("Links", fontsize=20) 178 | 179 | plt.show() 180 | 181 | 182 | if __name__ == "__main__": 183 | # path = friends.situation_room 184 | # path = friends.eggplant 185 | path = "./situationroom_7_29_18.json" 186 | main(path) 187 | -------------------------------------------------------------------------------- /name_dump.txt: -------------------------------------------------------------------------------- 1 | Hayden Branch 2 | Erick Vang 3 | Santiago Barnett 4 | Martha Berger 5 | Dane Macdonald 6 | Denise Casey 7 | Mathew Hunter 8 | Ciara David 9 | Kenna York 10 | Ulises Montoya 11 | Evie Strong 12 | Noel Lewis 13 | Elliott Smith 14 | Ryan Dennis 15 | Aubrey Hull 16 | Micah Pierce 17 | Lucia Church 18 | Lauren Riley 19 | Moshe Davies 20 | Brennen Lindsey 21 | Braeden Boyd 22 | Vincent Zamora 23 | Catherine Hanson 24 | Terrance Buchanan 25 | Donte Mccall 26 | Jorge Lucero 27 | Haleigh Moon 28 | Shyanne Hampton 29 | Pranav Dawson 30 | Diego Torres 31 | Jaycee Schwartz 32 | Amirah Sharp 33 | Camilla Wheeler 34 | Kamryn Malone 35 | Chelsea Warren 36 | Maxwell Robinson 37 | Adolfo Christensen 38 | Skylar Woods 39 | Scarlet Holmes 40 | Anderson Brennan 41 | Stephanie Moreno 42 | Shaylee Garrison 43 | Kendal Freeman 44 | Ariel Floyd 45 | Isabella Clarke 46 | Broderick Kelly 47 | Tara Hutchinson 48 | Olive Baird 49 | Kiara Carrillo 50 | Nyasia Noble 51 | Ashly Davis 52 | Nicholas Clay 53 | Dereon Cuevas 54 | Emanuel Cruz 55 | Reagan Leon 56 | Matthias Huang 57 | Tomas Harvey 58 | Cailyn Rhodes 59 | Brenton Carson 60 | Amiyah Bonilla 61 | Sonny Kaufman 62 | Junior Maynard 63 | Cayden Stokes 64 | Brielle Knox 65 | Kadence Cook 66 | Alden Webb 67 | Belen Orozco 68 | Jair Stephens 69 | Sage Pineda 70 | Lennon Richard 71 | Bruce Mcclure 72 | Bo Chapman 73 | Emily Andrews 74 | Audrey Ryan 75 | Zoie Cline 76 | Amber Cummings 77 | Madison Ortega 78 | Myla Cardenas 79 | Penelope Winters 80 | Brooklynn Baxter 81 | Braylon Orr 82 | Hassan Francis 83 | Nikhil Mckay 84 | Amy Barnes 85 | Jackson Ramsey 86 | Denisse Henry 87 | Alena Krueger 88 | Donavan Mendoza 89 | Mathias Sims 90 | Kaila Flores 91 | Nataly Pennington 92 | Keegan Olsen 93 | Cyrus Newman 94 | Marvin Benitez 95 | Kendrick Bradford 96 | Dante Shelton 97 | Rowan Dillon 98 | Raegan Willis 99 | Christian Choi 100 | Rodrigo Conner 101 | Ricky Koch 102 | Leon Phelps 103 | Kiley Potter 104 | Benjamin Simon 105 | Jamya Jensen 106 | Lesly Summers 107 | Charlie Howell 108 | Leslie Cox 109 | Felicity Bond 110 | Agustin Jimenez 111 | Mariyah Weeks 112 | Christina Vazquez 113 | Kaylen Harper 114 | Karsyn Yang 115 | Keenan Bell 116 | Coby Morgan 117 | Adyson Hoover 118 | Ella Delgado 119 | Chaim Gregory 120 | Emilie Norris 121 | Salvador Bass 122 | Guadalupe Watson 123 | Bianca Ruiz 124 | Carissa Berg 125 | Luciana Rubio 126 | Grace Hamilton 127 | Zion Suarez 128 | Alexandria Leblanc 129 | Jamar Hendrix 130 | Yasmine Landry 131 | Kaiya Lester 132 | Amelie Mack 133 | Omar Tran 134 | Kymani Hurst 135 | Abigail Larsen 136 | Jamari Bailey 137 | Andy Andersen 138 | Reina Benson 139 | Allen Blankenship 140 | Camron Brock 141 | Jesse Bentley 142 | Laura Stanton 143 | Camden Conway 144 | Franco Mcfarland 145 | Jeffrey Mccarthy 146 | Cloe Garza 147 | Brynlee Cooper 148 | Jaylah Mata 149 | Demarion Morales 150 | Ishaan Butler 151 | Ty Fuentes 152 | Ezequiel Chen 153 | Savanah Mooney 154 | Makenzie Chung 155 | April Gutierrez 156 | Santos Moore 157 | Abbigail Duarte 158 | Emiliano Pratt 159 | Alma Frey 160 | Cullen Lyons 161 | Reagan Mason 
162 | Skye Sanchez 163 | Christine Sosa 164 | August Salazar 165 | Abagail Shields 166 | Tamara Obrien 167 | Prince Ferguson 168 | Judah Serrano 169 | Alondra Neal 170 | Raina Benjamin 171 | Dennis Hays 172 | Miracle Mullins 173 | Holly Gates 174 | Ean Spence 175 | Jonathon Good 176 | Duncan Bowers 177 | Aaden Acosta 178 | Lana Hartman 179 | Lee Bennett 180 | Kareem Shannon 181 | Spencer Horne 182 | Esperanza Osborne 183 | Rigoberto Scott 184 | Aleah Osborn 185 | Ashlynn Hale 186 | Kenneth Forbes 187 | Rihanna Ball 188 | Morgan Bowen 189 | Koen Alvarado 190 | Abel Snyder 191 | Karina Vargas 192 | Glenn Mathews 193 | Averie Mcintosh 194 | Madelyn Wilkerson 195 | Kaylee Santos 196 | Zackary Booth 197 | Kadin Fuller 198 | Mikaela Bartlett 199 | Markus Fleming 200 | Zayden Schmidt 201 | Reese Ho 202 | Kassidy Cole 203 | Sidney Finley 204 | Braden Guerra 205 | Trystan Ochoa 206 | Leonel Sandoval 207 | Arturo Brady 208 | Sloane Mills 209 | Leyla Bishop 210 | Izabelle Sawyer 211 | Finnegan Kane 212 | Areli Schroeder 213 | Elyse Greene 214 | Yaretzi Chang 215 | Angel Rowe 216 | Ari Frank 217 | Jason Klein 218 | Lila Haas 219 | Jayden Camacho 220 | Ryleigh Herman 221 | Ellen Michael 222 | Hayden Oconnor 223 | Julia Roberts 224 | Lillianna Howe 225 | Sterling Griffin 226 | Andreas Middleton 227 | Fernanda Wilson 228 | Bryce Johns 229 | Branden Barry 230 | Darian Bender 231 | Janet Sheppard 232 | Alexzander Proctor 233 | Isaias Espinoza 234 | Tatiana May 235 | Johnathon Hess 236 | Camryn Livingston 237 | Jakob Sherman 238 | Eliezer Curry 239 | Jorden Villa 240 | Keely Oconnell 241 | Vance Wallace 242 | Brylee Braun 243 | Mollie Shea 244 | Janiya Castro 245 | Tania Jefferson 246 | Daniel Mcconnell 247 | Meredith Ellison 248 | Gauge Bush 249 | Elisa Hines 250 | Talan Martin 251 | Aubree Hickman 252 | Adam Clark 253 | Edith Horton 254 | Austin Mcknight 255 | Anthony Robles 256 | Ximena Baldwin 257 | Jace Carpenter 258 | Gunnar Wade 259 | Alivia Guerrero 260 | Katelynn Wells 261 | Taylor Bernard 262 | Carson Potts 263 | Wesley Goodman 264 | Brenda Savage 265 | Christopher Williams 266 | Ansley Thomas 267 | Nathanial Hinton 268 | Rubi Copeland 269 | Jakobe Owens 270 | Alvin Olson 271 | Kaliyah Coffey 272 | Annabella Dickson 273 | Jean Mitchell 274 | Konner Kirk 275 | Jaiden Mcintyre 276 | India Matthews 277 | Olivia Nguyen 278 | Paula Lamb 279 | Fiona Estrada 280 | Joslyn Poole 281 | Alina Preston 282 | Brayden Blevins 283 | Luna Mcpherson 284 | Eric Prince 285 | Chance Cunningham 286 | Jacqueline Jennings 287 | Cortez Chan 288 | Alisha Cordova 289 | Cordell Joyce 290 | Liana Atkinson 291 | Tyler Lowe 292 | Ainsley Harris 293 | Bernard Ali 294 | Britney Rios 295 | Robert Walker 296 | Ray Booker 297 | Elias Cortez 298 | Korbin Weber 299 | Kolten Miranda 300 | Tanner Mcmillan 301 | Ally Mccormick 302 | Waylon Reese 303 | Desiree Short 304 | Will Graves 305 | Corey Paul 306 | Jaron Bullock 307 | Mariam Sloan 308 | Albert Wiley 309 | Jerry Richmond 310 | Dustin Pace 311 | Kolton Houston 312 | Chana Harmon 313 | Josue Wang 314 | London Gould 315 | Russell Gross 316 | Eduardo Guzman 317 | Evan Watkins 318 | Ariel Mejia 319 | Katrina Gomez 320 | Dominick Martinez 321 | Billy Rowland 322 | Milagros Fletcher 323 | Santino Mercer 324 | Diamond White 325 | Jaelyn Schultz 326 | Zion Hanna 327 | Gwendolyn Giles 328 | Aaliyah Gray 329 | Derek Ewing 330 | Briley Ford 331 | Raven Figueroa 332 | Jadyn Kim 333 | Darnell Rojas 334 | Viviana Ferrell 335 | Jamal Patel 336 | Elena Luna 337 | Reed Navarro 338 | Gerald Bruce 339 
| Caitlyn Becker 340 | Averi Kennedy 341 | Ahmed Sullivan 342 | Chace Jacobson 343 | Kristian Harrison 344 | Marques Pollard 345 | Sammy Downs 346 | Caitlin Hicks 347 | Theodore Tate 348 | Callie Park 349 | Zayne Norman 350 | Enrique Patton 351 | Breanna Stout 352 | Hudson Herrera 353 | Daniela Mcgrath 354 | Sharon Reed 355 | Kaylie Massey 356 | Brycen Munoz 357 | Alan Lang 358 | Kaleb Beck 359 | Easton Gonzalez 360 | Callum Kemp 361 | Clare Mahoney 362 | Sofia Novak 363 | Andrew Lawson 364 | Zaire Trevino 365 | Jennifer Ponce 366 | Yasmin Schaefer 367 | Derrick Pugh 368 | Ana Morrison 369 | Bridger Watts 370 | Angel Flynn 371 | Ashlee Mullen 372 | Ramiro Curtis 373 | Charity English 374 | Nathaly Reeves 375 | Lawson Mercado 376 | Maren Evans 377 | Nathan Wilkinson 378 | Samir Banks 379 | Taylor Ellis 380 | Geovanni Cochran 381 | Micheal Bautista 382 | Emmett Blair 383 | Bradley Chambers 384 | Carsen Dunlap 385 | Jadyn Wiggins 386 | Macey Valenzuela 387 | Jovany Contreras 388 | Sarahi Kline 389 | Tabitha Tapia 390 | Presley Santiago 391 | Lucille Mclaughlin 392 | Amira Shepherd 393 | Alessandra Douglas 394 | Adan Joseph 395 | Briana Monroe 396 | Kenya Lee 397 | Joe Day 398 | Jayleen Mcbride 399 | Anastasia Sanford 400 | Madyson Davidson 401 | Chandler Greer 402 | Donna Calderon 403 | Alani Perry 404 | Marley Lam 405 | Craig Bean 406 | Rebecca King 407 | Jensen Johnson 408 | Adrien Owen 409 | Micah Mays 410 | Cora Everett 411 | Jessie Murillo 412 | Aisha Wagner 413 | Helen Chase 414 | Brodie Hebert 415 | Ramon Ortiz 416 | Walker Myers 417 | Madalyn Ballard 418 | Toby Stevens 419 | Elise Wall 420 | Byron Thornton 421 | Kaden Merritt 422 | Romeo Fischer 423 | Amir Weiss 424 | Rhett Parrish 425 | Parker Valdez 426 | Peter Best 427 | Deacon Ritter 428 | Riley Hayden 429 | Kennedy Leonard 430 | Peyton Charles 431 | Jimmy Maldonado 432 | Akira Tyler 433 | Amaya Franklin 434 | Reginald Kaiser 435 | Mateo Simmons 436 | Gage Anthony 437 | Jane Travis 438 | Semaj Terry 439 | Roselyn Whitaker 440 | Kendall Vega 441 | Matteo Hobbs 442 | Brittany Phillips 443 | Marie Castillo 444 | Dominique Rodgers 445 | Draven Newton 446 | Carlo Andrade 447 | Annalise Key 448 | Desmond Brooks 449 | Paloma Adkins 450 | Lorelei Jackson 451 | Walter Little 452 | Joselyn Roy 453 | Sarai Donaldson 454 | Brooklyn Steele 455 | Allyson Russell 456 | Kianna Cross 457 | Yandel Jenkins 458 | Zain Velez 459 | Donald Parsons 460 | Griffin Sexton 461 | Hadley Melendez 462 | Mohammad Kent 463 | Devyn Salas 464 | Howard Glover 465 | Jacoby Krause 466 | Alonso Nash 467 | Javion Kirby 468 | Wade Cobb 469 | Armando Stephenson 470 | Bridget Lara 471 | Phoenix Mcmahon 472 | Kamren Skinner 473 | Drew Stewart 474 | Kristen Hill 475 | Delilah Allen 476 | Colin Valencia 477 | Antoine Garcia 478 | Rylee Holt 479 | Memphis Cannon 480 | Bruno Edwards 481 | Shaniya Spencer 482 | Jamiya Hawkins 483 | Patricia Rich 484 | Aleena Reid 485 | Lyric Shah 486 | Vivian Duffy 487 | Devin Gay 488 | Karissa Hansen 489 | Halle Gentry 490 | Quintin Schneider 491 | Alberto Cabrera 492 | Journey Hudson 493 | Adalyn Wilkins 494 | Hallie Mckenzie 495 | Julianna Miller 496 | Dakota Humphrey 497 | Lia Carr 498 | Amiya Fernandez 499 | Kamari Collier 500 | Alfred Ray -------------------------------------------------------------------------------- /private_message_analysis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import re 4 | from pprint import pprint 5 | from tabulate import tabulate 6 
| from collections import defaultdict 7 | from itertools import combinations 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | from matplotlib.dates import date2num 12 | 13 | import friends 14 | from name_hash import NameHasher 15 | from helpers import get_json, bucket_datetime, time_format, width_dict 16 | 17 | nh = NameHasher() 18 | ANONYMOUS = False # We can make the data anonymous by hashing all the names except our own 19 | 20 | 21 | def generate_averages(paths=friends.ALL_FRIEND_PATHS): 22 | """ Analyze combinations of stats such as "Characters per Words" across all friends in paths""" 23 | stats = ["Characters", "Words", "Messages", "Clusters"] 24 | average_stats = [] 25 | for path in paths: 26 | message_json = get_json(path) 27 | messages = message_json.get("messages", []) 28 | participant = message_json.get("participants")[0]['name'] 29 | data = get_all_stats(messages) 30 | 31 | for sender in data["Characters"]["Month"]: 32 | if sender == "total": 33 | continue 34 | sender_averages = [] 35 | for small_stat, big_stat in combinations(stats, 2): 36 | sender_averages.append( 37 | sum(data[small_stat]["Year"][sender].values())/sum(data[big_stat]["Year"][sender].values())) 38 | if sender == friends.MY_NAME: 39 | if ANONYMOUS: 40 | sender = "%s + %s" % (friends.MY_NAME, 41 | nh.hash_by_name(participant)) 42 | else: 43 | sender = "%s + %s" % (friends.MY_NAME, participant) 44 | average_stats.append([sender, *sender_averages]) 45 | average_stats.sort(key=lambda x: x[3], reverse=True) 46 | 47 | print(tabulate(average_stats, headers=[ 48 | "Name", *["%s per %s" % combo for combo in combinations(stats, 2)]])) 49 | 50 | 51 | def get_all_stats(messages): 52 | """ 53 | Given 1 on 1 messages, generate stats over periods 54 | 55 | Supported stats: 56 | "Characters": total characters 57 | "Messages": total times enter is pressed 58 | "Clusters": all messages sent before being interrupted by the other participant form one cluster 59 | "Words": naively defined as the number of space-separated tokens (tokens containing ".com" are skipped) 60 | 61 | data is a four layer dictionary 62 | Stat -> Period -> name -> datetime.datetime -> value 63 | data returns a "core data structure" given a Stat and Period key: 64 | { 65 | "name1": { 66 | datetime.datetime: stat_val1 67 | }, 68 | "name2": { 69 | datetime.datetime: stat_val2 70 | }, 71 | "total": { 72 | datetime.datetime: stat_val1+stat_val2 73 | }, 74 | } 75 | Ex: data["Messages"]["Day"] gives daily message counts per sender plus a "total" entry 76 | """ 77 | periods = ["Year", "Month", "Day"] 78 | stats = ["Characters", "Messages", "Clusters", "Words"] 79 | 80 | # Create a four-layered defaultdict with default int leaf 81 | data = defaultdict(lambda: defaultdict( 82 | lambda: defaultdict(lambda: defaultdict(int)))) 83 | 84 | prev_sender = None 85 | for message in reversed(messages): 86 | timestamp = datetime.datetime.fromtimestamp( 87 | message["timestamp_ms"]/1000) 88 | sender_name = message["sender_name"] 89 | if ANONYMOUS and sender_name != friends.MY_NAME: 90 | sender_name = nh.hash_by_name(sender_name) 91 | content = message.get("content", "") 92 | 93 | for period in periods: 94 | m_time = bucket_datetime(timestamp, period) 95 | 96 | # Aggregate for messages, characters, clusters, words 97 | for name in [sender_name, "total"]: 98 | data["Characters"][period][name][m_time] += len(content) 99 | data["Words"][period][name][m_time] += len( 100 | [i for i in content.split(" ") if ".com" not in i]) 101 | # data["Words"][period][name][m_time] += len(content.split(" ")) 102 | 
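# "Messages" increments once per message sent; "Clusters" increments only when the sender changes,
# so a run of consecutive messages from one person counts as a single cluster.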
data["Messages"][period][name][m_time] += 1 103 | if sender_name != prev_sender: 104 | data["Clusters"][period][name][m_time] += 1 105 | 106 | prev_sender = sender_name 107 | 108 | return data 109 | 110 | 111 | def graph_stat(path=friends.BEST_FRIEND, stat="Messages", period="Year", avg=False): 112 | """ 113 | graph_stat wrapper that parses from a path 114 | """ 115 | message_json = get_json(path) 116 | messages = message_json.get("messages", []) 117 | 118 | data = get_all_stats(messages) 119 | _graph_stat(data, stat=stat, period=period, avg=avg) 120 | 121 | 122 | def _graph_stat(data, stat="Messages", period="Month", name="total", message_data=None, avg=False): 123 | """ 124 | The real graph stat function 125 | Graph parameterized stat from get_all_stats 126 | """ 127 | 128 | # Parse data and sort by dates 129 | if not message_data: 130 | message_data = data[stat][period][name] 131 | dates = date2num(list(message_data.keys())) 132 | counts = np.array(list(message_data.values())) 133 | dates, counts = zip(*sorted(zip(dates, counts))) 134 | 135 | if avg: 136 | dates, counts = list(dates), list(counts) 137 | new_counts = counts[:] 138 | for i in range(1, len(counts)-1): 139 | new_counts[i] = counts[i-1] + counts[i] + counts[i+1] 140 | counts = new_counts[1:-1] 141 | dates = dates[1:-1] 142 | 143 | ### BAR GRAPH ### 144 | bar = plt.bar(dates, counts, width=width_dict[period]) 145 | ax = plt.subplot(111) 146 | ax.xaxis_date() 147 | 148 | ### SCATTER PLOT ### 149 | # I think the bar graph displays data better 150 | # scatter = plt.plot_date(dates, counts, '.', label=name) 151 | # p1 = np.poly1d(np.polyfit(dates, counts, 10)) 152 | # best_fit_str = "%s best fit" % name 153 | # best_fit = plt.plot_date(dates, p1(dates), '--', label=best_fit_str) 154 | # plt.autoscale(True) 155 | # plt.grid(True) 156 | # plt.ylim(-100) 157 | # plt.legend() 158 | 159 | plt.ylabel('# of %s' % stat) 160 | plt.title("%s between %s per %s" % (stat, " and ".join( 161 | [i for i in data[stat][period].keys() if i != "total"]), period)) 162 | 163 | 164 | def top_n_stat(n=3, stat="Messages", period="Month", show_counts=False): 165 | """ 166 | Print top n stat'd person per period in a table 167 | """ 168 | res = defaultdict(list) 169 | 170 | for person, path in friends.ALL_FRIENDS: 171 | message_json = get_json(path) 172 | messages = message_json.get("messages", []) 173 | name = message_json.get("participants")[0] 174 | 175 | if ANONYMOUS: 176 | name = nh.hash_by_name(name) 177 | 178 | message_data = get_all_stats(messages)[stat][period]["total"] 179 | 180 | for date, count in message_data.items(): 181 | res[date].append((name, count)) 182 | 183 | # We want to sort by date 184 | res_list = sorted([[date, count_list] for date, count_list in res.items()]) 185 | 186 | table_data = [] 187 | 188 | for date, count_list in res_list[:]: 189 | # Format date by period 190 | date_str = date.strftime(time_format(period)) 191 | # Sort by count 192 | count_list.sort(key=lambda x: x[1], reverse=True) 193 | # Truncate to top n 194 | count_list = count_list[:n] 195 | if show_counts: 196 | name_and_counts = [] 197 | for name, count in count_list: 198 | name = name['name'] 199 | spaces = 30 - len(name) - len(str(count)) 200 | spaces_str = " "*spaces 201 | s = spaces_str.join([name, str(count)]) 202 | name_and_counts.append(s) 203 | table_data.append([date_str, *name_and_counts]) 204 | else: 205 | table_data.append( 206 | [date_str, *[name for name, count in count_list]]) 207 | print("Top %d Most %s per %s" % (n, stat, period)) 208 | 
print(tabulate(table_data, headers=[ 209 | period, *["#%d" % i for i in range(1, n+1)]])) 210 | 211 | # Attempt to use matplotlib for tables... ASCII seems better 212 | # TODO better table? 213 | # fig, ax = plt.subplots() 214 | # fig.patch.set_visible(False) 215 | # ax.axis('off') 216 | # ax.axis('tight') 217 | # col_labels = ["Month", *["#%d" % i for i in range(1, n+1)]] 218 | # table = plt.table(cellText=table_data, colWidths=[0.1] * (n+1), loc='center', colLabels=col_labels) 219 | 220 | # # Center the month column 221 | # cells = table.properties()["celld"] 222 | # for i in range(len(cells)//(n+1)): 223 | # cells[i, 0]._loc = 'center' 224 | # # Format table 225 | # table.set_fontsize(24) 226 | # table.scale(1.3, 1.1) 227 | 228 | # plt.title("Top %d %s Sent by per %s" % (n, stat, period)) 229 | 230 | 231 | def total_stat_sent(stat="Messages", period="Year"): 232 | """ 233 | Graph the total of a stat sent by YOU, summed over all friends 234 | """ 235 | res = defaultdict(int) 236 | 237 | for person, path in friends.ALL_FRIENDS: 238 | message_json = get_json(path) 239 | messages = message_json.get("messages", []) 240 | name = message_json.get("participants")[0] 241 | 242 | data = get_all_stats(messages) 243 | message_data = data[stat][period][friends.MY_NAME] 244 | 245 | for date, count in message_data.items(): 246 | res[date] += count 247 | 248 | res_list = sorted([(date, count) for date, count in res.items()]) 249 | dates = [elem[0] for elem in res_list[:-1]] 250 | counts = [elem[1] for elem in res_list[:-1]] 251 | 252 | bar = plt.bar(dates, counts, width=width_dict[period]) 253 | ax = plt.subplot(111) 254 | ax.xaxis_date() 255 | plt.ylabel('# of %s' % stat) 256 | plt.title("Total %s Sent %s per %s" % (stat, friends.MY_NAME, period)) 257 | 258 | 259 | def count_specific_words(WORDS, path=friends.BEST_FRIEND): 260 | """ 261 | Count the frequency of each word in WORDS between you and the friend at path 262 | Should we normalize by message count per year? 
263 | """ 264 | counters = defaultdict(lambda: defaultdict(int)) 265 | 266 | message_json = get_json(path) 267 | messages = message_json.get("messages", []) 268 | 269 | for keyword in WORDS: 270 | for message in messages: 271 | sender = message["sender_name"] 272 | if ANONYMOUS and sender != friends.MY_NAME: 273 | sender = nh.hash_by_name(sender) 274 | content = message.get("content", "") 275 | 276 | 277 | count = content.lower().count(keyword) 278 | counters[keyword][sender] += count 279 | table = [] 280 | for keyword, participants in counters.items(): 281 | table.append([keyword, *participants.values()]) 282 | print(tabulate(table, headers=["Word", *participants.keys()])) 283 | 284 | 285 | def count_links(paths=friends.ALL_FRIEND_PATHS[:20]): 286 | """ 287 | Count links sent between you and each friend 288 | """ 289 | table = [] 290 | link_re = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+' 291 | for path in paths: 292 | message_json = get_json(path) 293 | messages = message_json.get("messages", []) 294 | participant = message_json.get("participants")[0]['name'] 295 | counters = defaultdict(int) 296 | 297 | for message in messages: 298 | sender = message["sender_name"] 299 | content = message.get("content", "") 300 | num_links = len(re.findall(link_re, content)) 301 | counters[sender] += num_links 302 | 303 | table.append([ 304 | participant, 305 | counters[friends.MY_NAME]/max(counters[participant], 1), # avoid division by zero if the friend never sent a link 306 | counters[friends.MY_NAME], 307 | counters[participant], 308 | counters[friends.MY_NAME] + counters[participant]]) 309 | table.sort(key=lambda x: x[1], reverse=True) 310 | 311 | if ANONYMOUS: 312 | for row in table: 313 | row[0] = nh.hash_by_name(row[0]) 314 | 315 | print(tabulate(table, headers=[ 316 | "Name", "Ratio of Links", "Sent by me", "Sent by other", "Total"])) 317 | avg = np.average([x[1] for x in table if x[2] > 50]) 318 | stdev = np.std([x[1] for x in table]) 319 | print("Average Ratio: %f" % avg) 320 | print("Ratio STDEV: %f" % stdev) 321 | 322 | 323 | if __name__ == "__main__": 324 | """ 325 | supported periods: "Month", "Year", "Day" 326 | 327 | Supported stats: 328 | "Characters": total characters 329 | "Messages": total times enter is pressed 330 | "Clusters": all messages sent before being interrupted by the other participant form one cluster 331 | "Words": naively defined as the number of space-separated tokens 332 | """ 333 | graph_stat(friends.JAIDEV_PHADKE, stat="Characters", 334 | period="Month", avg=True) 335 | # top_n_stat(n=4, stat="Characters", period="Month", show_counts=True) 336 | # count_links(friends.ALL_FRIEND_PATHS[:20]) 337 | # generate_averages(friends.ALL_FRIEND_PATHS) 338 | # words = ["lol", "lool", "loool", "lmao", "haha", "hahaha", "hahahaha"] 339 | # count_specific_words(words, friends.BEST_FRIEND) 340 | # total_stat_sent(stat="Words", period="Year") 341 | 342 | plt.show(block=True) 343 | --------------------------------------------------------------------------------
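For reference, the `friends.py` generated by `setup.py` is expected to look roughly like this (a sketch based on `write_wrapper`, `generate_friends`, and `generate_name`; the names and paths are illustrative):

```
BEST_FRIEND = "data/johnsmith_a1b2c3/message.json"
JOHN_SMITH = "data/johnsmith_a1b2c3/message.json"
ALL_FRIENDS = [('JOHN SMITH', 'data/johnsmith_a1b2c3/message.json')]
ALL_FRIEND_PATHS = ['data/johnsmith_a1b2c3/message.json']
MY_NAME = "Zaibo Wang"
```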