├── .gitignore
├── Procfile
├── README.md
├── assets
├── fonts
│ └── Poppins-Medium.ttf
├── masks
│ └── walogo.jpg
└── stopwords
│ └── stop_words.pkl
├── main.py
├── requirements.txt
├── runtime.txt
└── src
├── interesting_search.py
├── utils.py
└── whatsapp_analyzer.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /env
2 | /notebooks
3 | /__pycache__
4 | /functions/__pycache__
5 | /.vscode
6 | /src/__pycache__
7 |
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: uvicorn main:app --host=0.0.0.0 --port=${PORT:-5000}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WhatsApp Chat Analyzer API
2 | An API to analyse WhatsApp chats and generate insights
3 |
4 | ## API Docs
5 |
6 | https://wa-chat-analyzer.herokuapp.com/docs
7 |
8 | ## Implementation
[#WhatsAppWrapped by OurChatStory.co](https://ourchatstory.co/)
11 |
12 | ## Features
13 |
14 | - Total chat count
15 | - Global chat percentile
16 | - Who texts first
17 | - Monthly analysis
18 | - Hourly analysis
19 | - Finding the trend
20 | - Longest gap
21 | - Word Cloud
22 | - Pie Chart for individual and group chats
23 |
24 | ## Contribution
25 |
26 | All communications will be managed via the issues section.
27 | To contribute, fork the repo and create a PR.
28 |
- Put up suggestions and feature requests as an issue.
30 |
31 | ## Maintainers
32 |
33 | - [Yajat Malhotra](https://www.github.com/iamyajat)
34 | - [Anshul Agarwala](https://www.github.com/anshulagx)
35 |
36 |
--------------------------------------------------------------------------------
/assets/fonts/Poppins-Medium.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamyajat/WhatsApp-Chat-Analyzer-API/a6844fd9b5723b7faa94c24d5f31374126cfdbba/assets/fonts/Poppins-Medium.ttf
--------------------------------------------------------------------------------
/assets/masks/walogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamyajat/WhatsApp-Chat-Analyzer-API/a6844fd9b5723b7faa94c24d5f31374126cfdbba/assets/masks/walogo.jpg
--------------------------------------------------------------------------------
/assets/stopwords/stop_words.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamyajat/WhatsApp-Chat-Analyzer-API/a6844fd9b5723b7faa94c24d5f31374126cfdbba/assets/stopwords/stop_words.pkl
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from re import split
2 | from fastapi import FastAPI, File, HTTPException, UploadFile
3 | import src.whatsapp_analyzer as wa
4 | import matplotlib.pyplot as plt
5 | from fastapi.middleware.cors import CORSMiddleware
6 | from starlette.exceptions import HTTPException as StarletteHTTPException
7 | from fastapi.responses import PlainTextResponse
8 | from starlette.responses import RedirectResponse
9 | from starlette.middleware.httpsredirect import HTTPSRedirectMiddleware
10 |
11 | import os
12 |
13 | env_name = os.getenv("ENV_NAME", "dev")
14 |
# In production, disable the interactive docs endpoints so the API surface
# is not publicly explorable; in dev, keep them and print the local URL.
if env_name == "prod":
    app = FastAPI(
        title="WhatsApp Analyzer",
        version="2.0",
        description="Get beautiful insights about your chats!",
        docs_url=None,
        redoc_url=None,
    )
    # app.add_middleware(HTTPSRedirectMiddleware)
else:
    print("DEV MODE")
    app = FastAPI(
        title="WhatsApp Analyzer",
        version="2.0",
        description="Get beautiful insights about your chats!",
    )
    print("DOCS:", "http://127.0.0.1:8000/docs")
32 |
33 |
# Allow the local dev frontend and the production site to call this API
# from a browser (CORS), with credentials and any method/header.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:3000", "https://ourchatstory.co"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
41 |
42 |
43 | # @app.exception_handler(StarletteHTTPException)
44 | # async def http_exception_handler(request, exc):
45 | # response = RedirectResponse(url="https://ourchatstory.co")
46 | # return response
47 |
48 |
# Debug/utility endpoints — registered only when ENV_NAME != "prod";
# production exposes only /wrap below.
if env_name == "dev":

    @app.get("/")
    async def root():
        # Send visitors of the bare API root to the product site.
        response = RedirectResponse(url="https://ourchatstory.co")
        return response

    @app.post("/chats_to_json")
    async def chats_to_json(file: UploadFile = File(...)):
        """Get your chats in JSON format. (Upload WhatsApp chats as .txt)"""
        # Validation is by filename extension only; the body is trusted to
        # be UTF-8 text.
        extension = file.filename.split(".")[-1] in ("txt", "TXT")
        if not extension:
            raise HTTPException(
                status_code=400, detail="Please upload .txt files only!"
            )
        contents = await file.read()
        decoded_contents = contents.decode("utf-8")
        chats = split("\n", decoded_contents)
        resp = wa.chats_to_json(chats)
        return resp

    @app.post("/analyze")
    async def analyze(file: UploadFile = File(...)):
        """Get an analysis of your chats. (Upload WhatsApp chats as .txt)"""
        extension = file.filename.split(".")[-1] in ("txt", "TXT")
        if not extension:
            raise HTTPException(
                status_code=400, detail="Please upload .txt files only!"
            )
        contents = await file.read()
        decoded_contents = contents.decode("utf-8")
        chats = split("\n", decoded_contents)
        resp = wa.analyze(chats)
        return resp

    @app.post("/throwback")
    async def random(n: int = 10, file: UploadFile = File(...)):
        """Get a set of n old chats. (Upload WhatsApp chats as .txt)"""
        # NOTE(review): handler name shadows the stdlib module name
        # "random", but that module is not imported here, so it's harmless.
        extension = file.filename.split(".")[-1] in ("txt", "TXT")
        if not extension:
            raise HTTPException(
                status_code=400, detail="Please upload .txt files only!"
            )
        contents = await file.read()
        decoded_contents = contents.decode("utf-8")
        chats = split("\n", decoded_contents)
        resp = wa.throwback_chats(chats, n)
        return resp

    @app.post("/wordcloud")
    async def word_cloud(file: UploadFile = File(...)):
        """Get a word cloud"""
        extension = file.filename.split(".")[-1] in ("txt", "TXT")
        if not extension:
            raise HTTPException(
                status_code=400, detail="Please upload .txt files only!"
            )
        contents = await file.read()
        decoded_contents = contents.decode("utf-8")
        chats = split("\n", decoded_contents)
        # Returned as a base64-encoded PNG string (the streaming-response
        # variant is kept below, commented out).
        img = wa.get_word_cloud(chats)
        # buf = io.BytesIO()
        # plt.imsave(buf, img, format="PNG")
        # buf.seek(0)
        # return StreamingResponse(
        #     buf,
        #     media_type="image/jpeg",
        #     headers={
        #         "Content-Disposition": 'inline; filename="%s.jpg"' % (file.filename[:-4],)
        #     },
        # )
        return img
121 |
122 |
123 | @app.post("/wrap")
124 | async def wrap(file: UploadFile = File(...)):
125 | """WhatsApp Wrap 2022"""
126 | file_type = file.filename.split(".")[-1]
127 | extension = file_type in ("txt", "TXT", "zip", "ZIP")
128 | print("\n\n---------------------------------------------")
129 | print(" " + file.filename.split(".")[0])
130 | print("---------------------------------------------")
131 | if not extension:
132 | raise HTTPException(
133 | status_code=400, detail="Please upload .txt or .zip files only!"
134 | )
135 | contents = await file.read()
136 | decoded_contents = ""
137 | if file_type == "zip" or file_type == "ZIP":
138 | try:
139 | decoded_contents = wa.extract_zip(contents)["_chat.txt"].decode("utf-8")
140 | except:
141 | raise HTTPException(
142 | status_code=400, detail="Zip file is corrupted! Please try again."
143 | )
144 | else:
145 | decoded_contents = contents.decode("utf-8")
146 | chats = split("\n", decoded_contents)
147 | resp = wa.wrap(chats)
148 | if resp != None:
149 | return resp
150 | else:
151 | raise HTTPException(
152 | status_code=400, detail="Not enough members or chats to analyze from 2022!"
153 | )
154 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | emoji==2.2.0
2 | fastapi==0.88.0
3 | matplotlib==3.6.2
4 | numpy==1.23.5
5 | pandas==1.5.2
6 | python_dateutil==2.8.2
7 | requests==2.28.1
8 | scipy==1.9.3
9 | starlette==0.22.0
10 | wordcloud==1.8.2.2
11 | uvicorn==0.20.0
12 | python-multipart==0.0.5
13 | zipfile36==0.1.3
14 |
--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.10.8
--------------------------------------------------------------------------------
/src/interesting_search.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import json
3 | import pandas as pd
4 | import numpy as np
5 |
6 |
def count_per_minute(df):
    """Return a DataFrame of messages-per-minute.

    Parameters
    ----------
    df : DataFrame with a "time" column of datetimes and a "message" column.

    Returns
    -------
    DataFrame with columns "time" (whole minutes since the Unix epoch)
    and "count" (number of messages within that minute).
    """
    # Work on a copy up front so the caller's frame is not polluted with
    # the temporary "iso_time" column (the original mutated df in place
    # before copying).
    df = df.copy()
    # Truncate to minute precision: "YYYY-MM-DD HH:MM:SS" -> "...:00".
    df["iso_time"] = df["time"].apply(lambda x: str(x)[:-3] + ":00")
    df["iso_time"] = pd.to_datetime(df["iso_time"], dayfirst=False)
    # nanoseconds since epoch -> whole minutes since epoch
    df["iso_time"] = df["iso_time"].astype(np.int64)
    df["iso_time"] = df["iso_time"].apply(lambda x: int(x / 60000000000))
    df = df.groupby(["iso_time"]).count()

    df["time"] = df.index

    # drop everything else other than time and chat
    df = df[["time", "message"]]
    df = df.reset_index(drop=True)

    # rename chat to count
    df = df.rename(columns={"message": "count"})

    return df
25 |
26 |
def interesting_search(original_df, count_df):
    """Locate the longest rapid-fire messaging session ("streak").

    count_df is the output of count_per_minute(): "time" holds minutes
    since the epoch, "count" holds messages in that minute. A streak keeps
    running while consecutive active minutes are at most 3 minutes apart.

    Returns a dict with the streak bounds (epoch milliseconds), duration
    in minutes, total messages sent, and average reply time in seconds.
    original_df is currently unused (kept for the commented-out
    message-extraction code below).
    """
    # find longest streak
    streak = 0
    streak_start = 0
    streak_end = 0
    for i in range(count_df.shape[0] - 1):
        if count_df["time"][i + 1] - count_df["time"][i] <= 3:
            streak += 1
        else:
            # Streak just broke: keep it if it beats the best so far.
            # NOTE(review): a streak that runs through the final row is
            # never compared here (no post-loop check) — looks like an
            # off-by-one; confirm before relying on that edge case.
            if streak > streak_end - streak_start:
                streak_start = i - streak
                streak_end = i
            streak = 0

    # stats for streak (minutes-since-epoch -> epoch milliseconds)
    longest_streak = streak_end - streak_start
    longest_streak_start = count_df["time"][streak_start] * 60000
    longest_streak_end = count_df["time"][streak_end] * 60000
    total_messages_sent = count_df["count"][streak_start:streak_end].sum()
    average_reply_time = (longest_streak_end - longest_streak_start) / (
        total_messages_sent * 1000
    )

    # convert longest streak to datetime (local time) for the log output
    longest_streak_start_dt = datetime.datetime.fromtimestamp(
        longest_streak_start / 1000
    ).strftime("%B %d, %Y")
    longest_streak_end_dt = datetime.datetime.fromtimestamp(
        longest_streak_end / 1000
    ).strftime("%B %d, %Y")

    # print the stats
    print("Longest streak:\t", longest_streak, "minutes")
    print("Total messages sent:\t", total_messages_sent, "messages")
    print("Longest streak date:\t", longest_streak_start_dt)
    # print("Longest streak end: ", longest_streak_end_dt)
    print("Average reply time:\t", round(average_reply_time, 2), "seconds")

    # find messages during longest streak
    # original_df = original_df[
    #     original_df["iso_time"].isin(count_df["time"][streak_start:streak_end])
    # ]

    # odf_json_str = original_df[["time", "sender", "message"]].to_json(orient="records")
    # odf_json = json.loads(odf_json_str)
    # odf_json[-1]["message"] = odf_json[-1]["message"][:-1]

    # return as dictionary (int()/float() casts strip numpy scalar types)
    return {
        "streak_duration": longest_streak,
        "streak_start": int(longest_streak_start),
        "streak_end": int(longest_streak_end),
        "total_messages_sent": int(total_messages_sent),
        "average_reply_time": float(average_reply_time),
        # "messages_during_streak": odf_json,
    }
83 |
84 |
def get_total_minutes(df):
    """Return (number of distinct active minutes, per-minute count frame)."""
    per_minute = count_per_minute(df)
    return per_minute.shape[0], per_minute
89 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
# parse a string in different country's datetime formats to return a datetime object
2 |
3 | import datetime
4 | import re
5 |
6 |
def parse_datetime(
    s,
    dayfirst=True,
):
    """Parse a WhatsApp timestamp string into a datetime.

    Handles the separators used by different locales ('/', '.', '-' for
    dates; ':' or '.' for times), optional seconds, and an optional
    lowercase am/pm marker (callers lowercase the string first).

    s: string to be parsed, e.g. '23/03/22, 5:59 pm' or '04/07/20, 15:20:25'
    dayfirst: True if the first date field is the day, False if the month
    return: datetime object, or None when the string does not parse
    """
    # collapse runs of (possibly invisible) whitespace to a single space
    s = re.sub(r"\s+", " ", s)

    # compile regex pattern to match date and time formats
    regex = re.compile(
        r"(\d{1,2})[\/\.-](\d{1,2})[\/\.-](\d{2,4})[ ,]*(\d{1,2})[:\.](\d{1,2})[:\.]?(\d{1,2})?[ ]?([ap]m)?"
    )

    # match the string with the regex pattern
    match = regex.match(s)
    if not match:
        return None

    # Extract the fields. Group names assume dayfirst; day and month are
    # swapped at construction time when dayfirst is False.
    day = int(match.group(1))
    month = int(match.group(2))
    year = int(match.group(3))
    hour = int(match.group(4))
    minute = int(match.group(5))
    second = int(match.group(6)) if match.group(6) else 0
    ampm = match.group(7)

    # Expand 2-digit years with a sliding pivot: 00-69 -> 2000s,
    # 70-99 -> 1900s. (The original pivot was hard-coded at 22, which
    # would have parsed chats from 2023 onwards as 1923+.)
    if year < 100:
        if year < 70:
            year += 2000
        else:
            year += 1900

    # if ampm is present, convert 12 hour format to 24 hour format
    if ampm:
        if ampm == "pm" and hour < 12:
            hour += 12
        elif ampm == "am" and hour == 12:
            hour = 0

    # Invalid calendar dates (e.g. 31/02) yield None rather than raising.
    try:
        if dayfirst:
            return datetime.datetime(year, month, day, hour, minute, second)
        else:
            return datetime.datetime(year, day, month, hour, minute, second)
    except ValueError:
        return None
74 |
75 |
76 | # test the function
77 | # print("1.", parse_datetime("23/03/22, 5:59 pm"))
78 | # print("2.", parse_datetime("23.03.22, 5:59 am"))
79 | # print("3.", parse_datetime("23-03-22, 5:59 pm"))
80 | # print("4.", parse_datetime("23.03.22, 5.59 pm"))
81 | # print("5.", parse_datetime("23.03.22, 5:59"))
82 | # print("6.", parse_datetime("23-03-22, 5.59 pm"))
83 | # print("7.", parse_datetime("03-23-22, 5:59", dayfirst=False))
84 | # print("8.", parse_datetime("04/07/20, 15:20:25"))
85 | # print("9.", parse_datetime("07/23/20, 15:20:25", dayfirst=False))
86 | # print("10.", parse_datetime("07/23/20, 05:20:25 pm", dayfirst=False))
87 | # print("11.", parse_datetime("20/03/22, 4:25 pm", dayfirst=True))
88 |
89 |
90 | # output
91 | # 1. 2022-03-23 17:59:00
92 | # 2. 2022-03-23 05:59:00
93 | # 3. 2022-03-23 17:59:00
94 | # 4. 2022-03-23 17:59:00
95 | # 5. 2022-03-23 05:59:00
96 | # 6. 2022-03-23 17:59:00
97 | # 7. 2022-03-23 05:59:00
98 | # 8. 2020-07-04 15:20:25
99 | # 9. 2020-07-23 15:20:25
100 | # 10. 2020-07-23 17:20:25
101 | # 11. 2022-03-20 16:25:00
102 |
103 |
104 | # from an array of dates find out if the dates are dayfirst or monthfirst
def check_dayfirst(dates):
    """Guess whether a list of date strings is day-first or month-first.

    Each string is parsed under both interpretations; whichever parses at
    least as many strings wins, with ties resolved in favour of day-first.
    """
    dayfirst_hits = 0
    monthfirst_hits = 0
    for date in dates:
        if parse_datetime(date, dayfirst=True):
            dayfirst_hits += 1
        if parse_datetime(date, dayfirst=False):
            monthfirst_hits += 1
    return dayfirst_hits >= monthfirst_hits
136 |
137 |
138 | # test the function
139 | # print(check_dayfirst(["11/05/22, 5:59 pm", "12/05/22, 5:30 am", "13/05/22, 5:59 pm"]))
140 | # print(check_dayfirst(["05/11/22, 5:59 pm", "05/12/22, 5:30 am", "05/13/22, 5:59 pm"]))
--------------------------------------------------------------------------------
/src/whatsapp_analyzer.py:
--------------------------------------------------------------------------------
1 | from http.client import HTTPException
2 | import numpy as np
3 | import pandas as pd
4 | import re
5 | import json
6 | import datetime
7 | import random
8 | from wordcloud import WordCloud
9 | import emoji
10 | from collections import Counter
11 | from datetime import timedelta
12 | import time
13 | from dateutil import tz
14 | import pickle
15 | import requests
16 | import scipy.stats as st
17 | import base64
18 | import io
19 | from zipfile import ZipFile
20 | from src.interesting_search import get_total_minutes, interesting_search
21 | from src.utils import check_dayfirst, parse_datetime
22 |
23 | with open("./assets/stopwords/stop_words.pkl", "rb") as f:
24 | stopwords = pickle.load(f)
25 |
26 |
def extract_zip(input_zip):
    """Read a zip archive given as raw bytes; return {member name: bytes}."""
    archive = ZipFile(io.BytesIO(input_zip))
    return {member: archive.read(member) for member in archive.namelist()}
30 |
31 |
def time_extractor(x, phone):
    """Return the timestamp prefix of a raw chat line.

    iOS lines look like "[date, time] sender: msg"; Android lines look
    like "date, time - sender: msg".
    """
    if phone == "IOS":
        return x[1 : x.find("] ")]
    return x[: x.find(" - ")]
40 |
41 |
def chat_extractor(x, phone):
    """Return the "sender: message" part of a raw chat line."""
    if phone == "IOS":
        start = x.find("] ") + 2
    else:
        start = x.find(" - ") + 3
    return x[start:]
49 |
50 |
def person_extractor(x):
    """Return the sender name before ": ", or NaN for lines without one."""
    sep = x.find(": ")
    return x[:sep] if sep != -1 else np.nan
57 |
58 |
def message_extractor(x):
    """Return the message text of a "sender: message" string.

    Returns NaN for unusable messages: empty text, deleted messages, and
    any "... omitted" media placeholder. Lines with no ": " separator
    (system messages) are returned whole.
    """
    y = x.find(": ") + 2
    s = ""
    if (y - 2) != -1:
        s = x[y:]
    else:
        s = x
    # NOTE(review): the original chain ended with `s.find("") != -1`,
    # which is true for EVERY string and made this function return NaN
    # unconditionally — presumably an invisible marker character was lost
    # from that literal. The degenerate check is dropped here.
    if (
        s == ""
        or s == "This message was deleted"
        or s == "You deleted this message"
        or s.find("image omitted") != -1
        or s.find("video omitted") != -1
        or s.find("audio omitted") != -1
        or s.find("file omitted") != -1
        or s.find("sticker omitted") != -1
        or s.find("gif omitted") != -1
        or s.find("voice omitted") != -1
        or s.find("contact omitted") != -1
        or s.find("location omitted") != -1
        or s.find("document omitted") != -1
    ):
        return np.nan
    else:
        return s
85 |
86 |
def parse_message(s, phone):
    """Split one raw chat line into [timestamp (lowercased), sender, message]."""
    stamp = time_extractor(s, phone)
    sender_and_text = chat_extractor(s, phone)
    sender = person_extractor(sender_and_text)
    text = message_extractor(sender_and_text)
    return [stamp.lower(), sender, text]
93 |
94 |
def chats_to_df(chats):
    """Parse raw export lines into a DataFrame with columns
    time (datetime), sender, message.

    Continuation lines (multi-line messages) are folded into the
    preceding message by checking each line against the platform's
    timestamp regex. Mutates the input list in place (strips LTR marks
    and carriage returns).
    """
    # NOTE(review): the IOS pattern's leading "[{1}" looks malformed
    # (likely meant to be an escaped "\["); confirm against real exports.
    REGEX = {
        "IOS": "^[{1}[0-9]+[\/|\–|\-|\.][0-9]+[\/|\–|\-|\.][0-9]+,?\s[0-9]+[:|.][0-9]+[:|.][0-9]+.*$",
        "ANDROID": "^[0-9]+/[0-9]+/[0-9]+,?\s[0-9]+[:|.][0-9]+\s.*$",
    }
    new_chats = []
    # Android lines separate timestamp and sender with " - "; iOS does not.
    phone = "ANDROID"
    if chats[0].find(" - ") == -1:
        phone = "IOS"
    c = 0
    i = 0
    while i < len(chats):
        # strip the LTR mark WhatsApp inserts and Windows line endings
        chats[i] = chats[i].replace("\u200e", "").replace("\r", "")
        new_chats.append(chats[i])
        i += 1
        # fold lines until the next one that starts with a timestamp
        while i < len(chats) and not bool(re.search(REGEX[phone], chats[i])):
            new_chats[c] += "\n" + chats[i]
            i += 1
        c += 1

    wa_data = pd.DataFrame(new_chats, columns=["chats"])
    wa_data = wa_data["chats"].apply(parse_message, args=(phone,))

    wa_data = pd.DataFrame(wa_data.tolist(), columns=["time", "sender", "message"])

    wa_data.columns = ["time", "sender", "message"]

    # detect day-first vs month-first from the whole corpus, then parse
    dayfirst = check_dayfirst(list(wa_data["time"]))
    wa_data["time"] = wa_data["time"].apply(parse_datetime, args=(dayfirst,))

    return wa_data
126 |
127 |
def members(df):
    """Return the list of sender names, excluding the NaN placeholder."""
    return [m for m in df["sender"].unique() if str(m) != "nan"]
132 |
133 |
def getYear2022(df):
    """Return only the 2022 rows, NaNs dropped, index reset."""
    # Copy the filtered slice so dropna(inplace=True) does not operate on
    # a view of the caller's frame (avoids pandas' SettingWithCopyWarning
    # and accidental caller mutation).
    df = df[df["time"].dt.year == 2022].copy()
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df
139 |
140 |
def extract_emojis(s):
    """Return a string containing only the emoji characters found in s."""
    return "".join(ch for ch in s if ch in emoji.EMOJI_DATA)
143 |
144 |
def chats_to_json(chats):
    """Parse raw lines and return {"no_of_messages": N, "chats": [...]}."""
    records = json.loads(chats_to_df(chats).to_json(orient="records"))
    # drop the final message's last character (presumably a trailing
    # newline from the export) — mirrors long-standing behavior
    records[-1]["message"] = records[-1]["message"][:-1]
    return {"no_of_messages": len(records), "chats": records}
151 |
152 |
def no_of_messages_per_member(df):
    """Return [{"member": name, "count": n}, ...], most active first."""
    counts = df["sender"].value_counts().to_dict()
    return [{"member": member, "count": total} for member, total in counts.items()]
157 |
158 |
def word_count(df):
    """Return [{"member": name, "count": total words}, ...], biggest first."""
    df = df.copy()
    df["no_of_words"] = df["message"].apply(lambda m: len(str(m).split()))
    df = df.reset_index(drop=True)
    totals = {}
    for sender in df["sender"].unique():
        totals[sender] = sum(df[df["sender"] == sender]["no_of_words"])
    ordered = sorted(totals.items(), key=lambda kv: kv[1], reverse=True)
    return [{"member": member, "count": int(total)} for member, total in ordered]
175 |
176 |
def chats_month(df):
    """Return ([{"month": "Jan", "count": n}, ... 12 entries ...],
    Pearson correlation between month number (1-12) and count).

    Also adds a "month" column to the caller's frame (in place, like the
    original).
    """
    df["month"] = pd.DatetimeIndex(df["time"]).month
    per_month = df["month"].value_counts().to_dict()
    names = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    ]
    month_count = [{"month": name, "count": 0} for name in names]
    for number, count in per_month.items():
        month_count[number - 1]["count"] = count

    frame = pd.DataFrame(month_count)
    frame["month_codes"] = pd.Series(range(1, 13))
    return month_count, frame["month_codes"].corr(frame["count"])
204 |
205 |
def chats_date(df):
    """Add a "date" column (datetime.date per row) derived from "time", in place."""
    df["date"] = df["time"].dt.date
208 |
209 |
def check_chat_date(df, date):
    """True when at least one message falls on the given calendar date."""
    return bool((df["date"] == date).any())
212 |
213 |
def convert_long_to_date(long_date):
    """Convert epoch milliseconds to a (local-time) datetime.date."""
    return datetime.datetime.fromtimestamp(long_date / 1000).date()
218 |
219 |
def get_chat_date_string(df, longest_break_start, longest_break_end):
    """Encode the year 2022 as one character per day.

    "9" = before the chat's first active day, "0" = a day with chats,
    "1" = a silent day, "2" = a day strictly inside the longest no-talk
    gap. longest_break_start/end are epoch milliseconds.
    """
    chats_date(df)
    result_chat_date = ""
    first_day = False
    # loop through all of the days in the year and check if there is a chat on that day
    for month in range(1, 13):
        for day in range(1, 32):
            try:
                d = datetime.date(2022, month, day)
            except ValueError:
                # skip impossible dates such as Feb 30
                continue
            if (not first_day) and check_chat_date(df, d):
                first_day = True

            if first_day:
                start_gap = convert_long_to_date(longest_break_start)
                end_gap = convert_long_to_date(longest_break_end)
                if d > start_gap and d < end_gap:
                    result_chat_date += "2"
                    continue
                if check_chat_date(df, d):
                    result_chat_date += "0"
                else:
                    result_chat_date += "1"

            else:
                result_chat_date += "9"

    return result_chat_date
249 |
250 |
def get_gender(name):
    """Query the genderize.io API for the given name(s); return parsed JSON."""
    response = requests.get(url="https://api.genderize.io", params={"name": name})
    return response.json()
257 |
258 |
def get_category(names):
    """Best-effort boy/girl classification of the chat's members.

    Sends each member's first name (lowercased) to genderize.io and
    returns {"boy": bool, "girl": bool}. On API failure the flags are
    left as computed so far — possibly both False.
    """
    first_names = []
    for name in names:
        parts = name.split()
        first_names.append(parts[0].lower())
    data = get_gender(first_names)
    genders = []
    gb = {"boy": False, "girl": False}
    try:
        for d in data:
            genders.append(d["gender"])
            if d["gender"] == "male":
                gb["boy"] = True
            elif d["gender"] == "female":
                gb["girl"] = True
    except Exception:
        # Narrowed from a bare except: the loop raises when the response
        # is not the expected list of dicts (e.g. presumably when the API
        # rate limit is hit — confirm against the genderize.io docs).
        print("Gender API calls over")

    return gb
278 |
279 |
def most_used_emoji(df):
    """Return the 10 most frequent emoji as [{"emoji": e, "count": n}, ...].

    Skin-tone modifier code points are stripped before counting so tone
    variants of the same emoji are tallied together.
    """
    all_emoji = "".join(df["message"].apply(extract_emojis).tolist())
    for modifier in ("\U0001f3fb", "\U0001f3fc", "\U0001f3fd", "\U0001f3fe", "\U0001f3ff"):
        all_emoji = all_emoji.replace(modifier, "")
    return [
        {"emoji": symbol, "count": n}
        for symbol, n in Counter(all_emoji).most_common(10)
    ]
295 |
296 |
def chats_hour(df):
    """Return 24 buckets [{"hour": h, "count": n}] of messages per hour.

    Also adds an "hour" column to the caller's frame (in place, like the
    original).
    """
    df["hour"] = pd.DatetimeIndex(df["time"]).hour
    hour_count = [{"hour": h, "count": 0} for h in range(24)]
    for hour, count in df["hour"].value_counts().to_dict().items():
        hour_count[hour]["count"] = count
    return hour_count
304 |
305 |
def get_time_diff(df):
    """Add a "time_diff" column (delta to the previous message), in place."""
    df["time_diff"] = df["time"].diff()
    return df


def longest_wait(df):
    """Find the longest silence in the chat.

    Returns {"gap": milliseconds, "start_time": epoch-ms, "end_time":
    epoch-ms}; all zeros when the gap cannot be computed (e.g. fewer
    than two messages).
    """
    try:
        df = get_time_diff(df)
        df1 = df[df["time_diff"] == df["time_diff"].max()]
        max_gap = df1["time_diff"].max()
        date1 = df1["time"].iloc[0]
        date2 = date1 - max_gap
        # convert max_gap to whole seconds
        max_gap = int(max_gap.total_seconds())
        return {
            "gap": int(max_gap) * 1000,
            "start_time": int(date2.timestamp() * 1000),
            "end_time": int(date1.timestamp() * 1000),
        }
    except Exception:
        # Narrowed from a bare except; still falls back to a zero gap on
        # any failure (empty frame, NaT-only diffs, ...).
        return {
            "gap": 0,
            "start_time": 0,
            "end_time": 0,
        }
331 |
332 |
def who_texts_first(df):
    """Return the member who most often breaks a silence of over an hour."""
    df = get_time_diff(df)
    openers = df[df["time_diff"] > timedelta(minutes=60)]
    opener_counts = openers["sender"].value_counts().to_dict()
    if not opener_counts:
        return "No one"
    return max(opener_counts, key=opener_counts.get)
341 |
342 |
def throwback_chats(chats, n):
    """Return n consecutive messages starting at a random offset.

    When the chat has at most n messages, all of them are returned.
    """
    df = chats_to_df(chats).drop("time", axis=1)
    total = df["sender"].size
    if total > n:
        offset = random.randint(0, total - n - 1)
        df = df.iloc[offset : offset + n]
    records = json.loads(df.to_json(orient="records"))
    # drop the final message's last character — mirrors chats_to_json
    records[-1]["message"] = records[-1]["message"][:-1]
    return {"throwback_chats": records}
354 |
355 |
def words_weight(df):
    """Concatenate all messages into one lowercase, URL-free word string.

    Returns "chat unavailable" when nothing usable remains.
    """
    pieces = []
    for val in df["message"]:
        tokens = str(val).split()
        pieces.append(" ".join(t.lower() for t in tokens) + " ")
    chat_words = re.sub(r"http\S+", "", "".join(pieces))
    if chat_words.strip() == "":
        return "chat unavailable"
    return chat_words
368 |
369 |
def word_cloud_words(df):
    """Return up to 100 [{"word", "count", "weight"}] entries, where
    weight scales each count into (0, 1] relative to the top word."""
    frequencies = WordCloud(
        stopwords=stopwords,
    ).process_text(words_weight(df))
    frequencies = dict(
        sorted(frequencies.items(), key=lambda kv: kv[1], reverse=True)
    )
    if len(frequencies) > 100:
        frequencies = {w: frequencies[w] for w in list(frequencies)[:100]}
    floor = min(frequencies.values()) - 1
    span = max(frequencies.values()) - floor
    return [
        {"word": w, "count": n, "weight": ((n - floor) / span)}
        for w, n in frequencies.items()
    ]
387 |
388 |
def word_cloud(df):
    """Render the chat's words as a WordCloud image object."""
    chat_words = words_weight(df)
    # mask_arr = np.array(Image.open("assets/masks/walogo.jpg"))
    wordcloud = WordCloud(
        font_path="assets/fonts/Poppins-Medium.ttf",
        # mask=mask_arr,
        min_word_length=2,
        width=360,
        height=480,
        stopwords=stopwords,
        min_font_size=12,
        colormap="gist_ncar",
    )
    try:
        return wordcloud.generate(chat_words)
    except Exception:
        # Narrowed from a bare except. generate() raises when no words
        # survive filtering; fall back to a placeholder cloud instead of
        # failing the request.
        return wordcloud.generate("chat unavailable")
409 |
410 |
def get_word_cloud(chats):
    """Parse raw chat lines and return the word cloud as a base64 PNG."""
    return word_cloud_to_base64(chats_to_df(chats))
414 |
415 |
def word_cloud_to_base64(df):
    """Render the word cloud and return the PNG bytes as a base64 string."""
    buffer = io.BytesIO()
    word_cloud(df).to_image().save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
423 |
424 |
def most_active_day(df):
    """Return {"date": epoch-ms of UTC midnight, "amount": n} for the
    busiest calendar day (first one on a tie).

    Also adds a "date" column to the caller's frame (in place, like the
    original).
    """
    df["date"] = pd.DatetimeIndex(df["time"]).date
    counts = df["date"].value_counts()
    busiest = counts.loc[counts == counts.max()].to_dict()
    day, amount = next(iter(busiest.items()))
    midnight_utc = datetime.datetime(
        year=day.year, month=day.month, day=day.day, tzinfo=tz.tzutc()
    )
    return {"date": midnight_utc.timestamp() * 1000, "amount": amount}
441 |
442 |
def zscore(amt):
    """Score a yearly chat count against an assumed global population
    (mean 22000, std 12000); return (z, CDF probability clamped to
    [0.0001, 0.999999])."""
    MEAN, STD = 22000, 12000
    z = (amt - MEAN) / STD
    p = st.norm.cdf(z)
    return z, max(min(p, 0.999999), 0.0001)
449 |
450 |
451 | # get median of time difference
# get median of time difference
def get_median_time_diff(df):
    """Median reply gap in seconds; 0 when there are no gaps.

    The first row's diff (NaT against nothing) is skipped.
    """
    gaps = sorted(td.total_seconds() for td in list(df["time_diff"])[1:])
    if not gaps:
        return 0
    return np.median(gaps)
459 |
460 |
461 | # get every 10%, 20%, 30%.... 90% of the time difference
# get every 10%, 20%, 30%.... 90% of the time difference
def get_time_diff_percentile(df):
    """Return the 2nd, 4th, ..., 100th percentiles of reply gaps in
    seconds (50 values), or 0 when there are no gaps."""
    gaps = sorted(td.total_seconds() for td in list(df["time_diff"])[1:])
    if not gaps:
        return 0
    return [np.percentile(gaps, 2 * i) for i in range(1, 51)]
472 |
473 |
474 | # get the reponsiveness of the chat
# get the reponsiveness of the chat
def get_responsiveness(df, percentiles):
    """Return the index of the first non-zero percentile — a proxy for
    how instantly the chat gets replies; 0 when all are zero.

    df is unused; kept for interface compatibility.
    """
    for rank, value in enumerate(percentiles):
        if value > 0:
            print("Chat responsiveness:\t", (rank / 50.0))
            return rank
    return 0
482 |
483 |
def analyze(chats):
    """Parse raw chat lines and return the basic per-chat statistics."""
    df = chats_to_df(chats)
    chat_members = members(df)
    per_member = no_of_messages_per_member(df)
    per_member_words = word_count(df)
    monthly = chats_month(df)
    return {
        "members": chat_members,
        "no_of_messages": len(df["message"]),
        "no_of_messages_per_member": per_member,
        "word_count_per_member": per_member_words,
        "month_chats_count": monthly,
    }
498 |
499 |
def wrap(chats):
    """Build the full "WhatsApp Wrap 2022" report from raw export lines.

    Returns a dict of all wrap statistics, or None when the 2022 portion
    of the chat is too small (< 75 messages) or has fewer than 2 members.
    Prints a human-readable summary to stdout as a side effect.
    """
    df = getYear2022(chats_to_df(chats))
    # require a minimum amount of 2022 activity to produce a meaningful wrap
    if df.shape[0] < 75:
        print("\nNot enough members or chats to analyze from 2022!\n\n")
        return None
    print("\n\n---------------------------------------------")
    print("Members")
    print("---------------------------------------------")
    total_chats = len(df["message"])
    chat_members = members(df)
    num_members = len(chat_members)
    if num_members < 2:
        return None
    num_arr = no_of_messages_per_member(df)
    # words = word_count(df)
    months, month_corr = chats_month(df)

    # get max month
    max_month = months[0]
    for m in months:
        if m["count"] > max_month["count"]:
            max_month = m
    hours = chats_hour(df)
    max_hour = hours[0]
    for h in hours:
        if h["count"] > max_hour["count"]:
            max_hour = h

    active_day = most_active_day(df)
    top_10_emoji = most_used_emoji(df)
    # cloud_words = word_cloud_words(df)
    z, p = zscore(len(df.index))

    top_percent = 1 - p

    if chat_members:
        # print chat members
        print(", ".join(chat_members))
    else:
        # NOTE(review): this bare string literal is a no-op, not a print —
        # kept as-is to preserve behavior.
        "No members found"

    longest_gap = longest_wait(df)

    talk_string = get_chat_date_string(
        df, longest_gap["start_time"], longest_gap["end_time"]
    )

    total_mins, count_df = get_total_minutes(df)

    print("\n\n\n---------------------------------------------")
    print(" Chat Statistics")
    print("---------------------------------------------")

    print("Total chats:\t\t " + str(total_chats))
    print("Total members:\t " + str(num_members))
    print("Total minutes:\t " + str(total_mins))

    top_percent_100 = round(top_percent * 100, 2)
    print("Top percentile:\t ", top_percent_100, "%", sep="")

    print("Most active month:\t " + max_month["month"])
    print("Month correlation:\t", round(month_corr, 4))

    # convert to 12 hour time
    m_hour = max_hour["hour"] % 12
    if m_hour == 0:
        m_hour = 12
    ampm = "AM"
    if max_hour["hour"] >= 12:
        ampm = "PM"
    print(
        "Most active hour:\t ",
        str(m_hour),
        " ",
        ampm,
        " (",
        max_hour["hour"],
        ")",
        sep="",
    )

    print(
        "Most active day:\t "
        + datetime.datetime.fromtimestamp(active_day["date"] / 1000).strftime(
            "%B %d, %Y"
        )
    )

    # get median of time difference
    # median_time_diff = get_median_time_diff(df)

    # get every 10%, 20%, 30%.... 90% of the time difference
    time_diff_percentile = get_time_diff_percentile(df)

    # get the reponsiveness of the chat
    responsiveness = get_responsiveness(df, time_diff_percentile)

    longest_gap_in_days = int(longest_gap["gap"] / (24 * 60 * 60 * 1000))
    longest_session = interesting_search(df, count_df)

    print("Longest gap:\t\t", longest_gap_in_days, "days")
    print(
        "Longest gap start:\t",
        datetime.datetime.fromtimestamp(longest_gap["start_time"] / 1000).strftime(
            "%B %d, %Y"
        ),
    )
    print(
        "Longest gap end:\t",
        datetime.datetime.fromtimestamp(longest_gap["end_time"] / 1000).strftime(
            "%B %d, %Y"
        ),
    )

    return {
        "group": len(chat_members) > 2,
        "members": chat_members,
        # "gender": get_category(chat_members),
        "total_no_of_chats": total_chats,
        "total_no_of_minutes": total_mins,
        "top_percent": top_percent,
        # "z_score": z,
        "most_active_member": num_arr[0] if len(num_arr) != 0 else "No one",
        "no_of_messages_per_member": num_arr,
        # "word_count_per_member": words,
        # "median_reply_time": (median_time_diff / 60.0),
        # "reply_time_percentile": [x / 60.0 for x in time_diff_percentile],
        "chat_responsiveness": responsiveness / 50.0,
        "most_active_month": max_month,
        "month_correlation": month_corr,
        "monthly_chats_count": months,
        "most_active_hour": max_hour,
        "hourly_count": hours,
        "most_active_day": active_day,
        "longest_session": longest_session,
        "longest_gap": longest_gap,
        "no_talk_string": talk_string,
        "who_texts_first": who_texts_first(df),
        # "most_used_emoji": top_10_emoji[0],
        "top_10_emojis": top_10_emoji,
        # "most_used_word": cloud_words[0],
        # "word_cloud_words": cloud_words,
        "word_cloud_base64": word_cloud_to_base64(df),
    }
644 |
--------------------------------------------------------------------------------