├── .gitignore
├── Procfile
├── README.md
├── assets
│   ├── fonts
│   │   └── Poppins-Medium.ttf
│   ├── masks
│   │   └── walogo.jpg
│   └── stopwords
│       └── stop_words.pkl
├── main.py
├── requirements.txt
├── runtime.txt
└── src
    ├── interesting_search.py
    ├── utils.py
    └── whatsapp_analyzer.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /env
2 | /notebooks
3 | /__pycache__
4 | /functions/__pycache__
5 | /.vscode
6 | /src/__pycache__
7 |
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: uvicorn main:app --host=0.0.0.0 --port=${PORT:-5000}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WhatsApp Chat Analyzer API
2 | An API to analyse WhatsApp chats and generate insights
3 |
4 | ## API Docs
5 |
6 | https://wa-chat-analyzer.herokuapp.com/docs
7 |
8 | ## Implementation
9 | ![Site Screenshot](https://github.com/anshulagx/OurChatStory-Web/blob/main/public/logo192.png)
10 | [#WhatsAppWrapped by OurChatStory.co](https://ourchatstory.co/)
11 |
12 | ## Features
13 |
14 | - Total chat count
15 | - Global chat percentile
16 | - Who texts first
17 | - Monthly analysis
18 | - Hourly analysis
19 | - Finding the trend
20 | - Longest gap
21 | - Word Cloud
22 | - Pie Chart for individual and group chats
23 |
24 | ## Contribution
25 |
26 | All communication will be managed via the issues section.
27 | To contribute, fork the repo and create a PR.
28 |
29 | - Put up suggestions and feature requests as issues.
30 |
31 | ## Maintainers
32 |
33 | - [Yajat Malhotra](https://www.github.com/iamyajat)
34 | - [Anshul Agarwala](https://www.github.com/anshulagx)
35 |
36 |
--------------------------------------------------------------------------------
/assets/fonts/Poppins-Medium.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamyajat/WhatsApp-Chat-Analyzer-API/a6844fd9b5723b7faa94c24d5f31374126cfdbba/assets/fonts/Poppins-Medium.ttf
--------------------------------------------------------------------------------
/assets/masks/walogo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamyajat/WhatsApp-Chat-Analyzer-API/a6844fd9b5723b7faa94c24d5f31374126cfdbba/assets/masks/walogo.jpg
--------------------------------------------------------------------------------
/assets/stopwords/stop_words.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/iamyajat/WhatsApp-Chat-Analyzer-API/a6844fd9b5723b7faa94c24d5f31374126cfdbba/assets/stopwords/stop_words.pkl
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from re import split
2 | from fastapi import FastAPI, File, HTTPException, UploadFile
3 | import src.whatsapp_analyzer as wa
4 | import matplotlib.pyplot as plt
5 | from fastapi.middleware.cors import CORSMiddleware
6 | from starlette.exceptions import HTTPException as StarletteHTTPException
7 | from fastapi.responses import PlainTextResponse
8 | from starlette.responses import RedirectResponse
9 | from starlette.middleware.httpsredirect import HTTPSRedirectMiddleware
10 |
11 | import os
12 |
13 | env_name = os.getenv("ENV_NAME", "dev")
14 |
15 | if env_name == "prod":
16 |     app = FastAPI(
17 |         title="WhatsApp Analyzer",
18 |         version="2.0",
19 |         description="Get beautiful insights about your chats!",
20 |         docs_url=None,
21 |         redoc_url=None,
22 |     )
23 |     # app.add_middleware(HTTPSRedirectMiddleware)
24 | else:
25 |     print("DEV MODE")
26 |     app = FastAPI(
27 |         title="WhatsApp Analyzer",
28 |         version="2.0",
29 |         description="Get beautiful insights about your chats!",
30 |     )
31 |     print("DOCS:", "http://127.0.0.1:8000/docs")
32 |
33 |
34 | app.add_middleware(
35 |     CORSMiddleware,
36 |     allow_origins=["http://localhost:3000", "https://ourchatstory.co"],
37 |     allow_credentials=True,
38 |     allow_methods=["*"],
39 |     allow_headers=["*"],
40 | )
41 |
42 |
43 | # @app.exception_handler(StarletteHTTPException)
44 | # async def http_exception_handler(request, exc):
45 | #     response = RedirectResponse(url="https://ourchatstory.co")
46 | #     return response
47 |
48 |
49 | if env_name == "dev":
50 |
51 |     @app.get("/")
52 |     async def root():
53 |         response = RedirectResponse(url="https://ourchatstory.co")
54 |         return response
55 |
56 |     @app.post("/chats_to_json")
57 |     async def
chats_to_json(file: UploadFile = File(...)): 58 | """Get your chats in JSON format. (Upload WhatsApp chats as .txt)""" 59 | extension = file.filename.split(".")[-1] in ("txt", "TXT") 60 | if not extension: 61 | raise HTTPException( 62 | status_code=400, detail="Please upload .txt files only!" 63 | ) 64 | contents = await file.read() 65 | decoded_contents = contents.decode("utf-8") 66 | chats = split("\n", decoded_contents) 67 | resp = wa.chats_to_json(chats) 68 | return resp 69 | 70 | @app.post("/analyze") 71 | async def analyze(file: UploadFile = File(...)): 72 | """Get an analysis of your chats. (Upload WhatsApp chats as .txt)""" 73 | extension = file.filename.split(".")[-1] in ("txt", "TXT") 74 | if not extension: 75 | raise HTTPException( 76 | status_code=400, detail="Please upload .txt files only!" 77 | ) 78 | contents = await file.read() 79 | decoded_contents = contents.decode("utf-8") 80 | chats = split("\n", decoded_contents) 81 | resp = wa.analyze(chats) 82 | return resp 83 | 84 | @app.post("/throwback") 85 | async def random(n: int = 10, file: UploadFile = File(...)): 86 | """Get a set of n old chats. (Upload WhatsApp chats as .txt)""" 87 | extension = file.filename.split(".")[-1] in ("txt", "TXT") 88 | if not extension: 89 | raise HTTPException( 90 | status_code=400, detail="Please upload .txt files only!" 91 | ) 92 | contents = await file.read() 93 | decoded_contents = contents.decode("utf-8") 94 | chats = split("\n", decoded_contents) 95 | resp = wa.throwback_chats(chats, n) 96 | return resp 97 | 98 | @app.post("/wordcloud") 99 | async def word_cloud(file: UploadFile = File(...)): 100 | """Get a word cloud""" 101 | extension = file.filename.split(".")[-1] in ("txt", "TXT") 102 | if not extension: 103 | raise HTTPException( 104 | status_code=400, detail="Please upload .txt files only!" 105 | ) 106 | contents = await file.read() 107 | decoded_contents = contents.decode("utf-8") 108 | chats = split("\n", decoded_contents) 109 | img = wa.get_word_cloud(chats) 110 | # buf = io.BytesIO() 111 | # plt.imsave(buf, img, format="PNG") 112 | # buf.seek(0) 113 | # return StreamingResponse( 114 | # buf, 115 | # media_type="image/jpeg", 116 | # headers={ 117 | # "Content-Disposition": 'inline; filename="%s.jpg"' % (file.filename[:-4],) 118 | # }, 119 | # ) 120 | return img 121 | 122 | 123 | @app.post("/wrap") 124 | async def wrap(file: UploadFile = File(...)): 125 | """WhatsApp Wrap 2022""" 126 | file_type = file.filename.split(".")[-1] 127 | extension = file_type in ("txt", "TXT", "zip", "ZIP") 128 | print("\n\n---------------------------------------------") 129 | print(" " + file.filename.split(".")[0]) 130 | print("---------------------------------------------") 131 | if not extension: 132 | raise HTTPException( 133 | status_code=400, detail="Please upload .txt or .zip files only!" 134 | ) 135 | contents = await file.read() 136 | decoded_contents = "" 137 | if file_type == "zip" or file_type == "ZIP": 138 | try: 139 | decoded_contents = wa.extract_zip(contents)["_chat.txt"].decode("utf-8") 140 | except: 141 | raise HTTPException( 142 | status_code=400, detail="Zip file is corrupted! Please try again." 143 | ) 144 | else: 145 | decoded_contents = contents.decode("utf-8") 146 | chats = split("\n", decoded_contents) 147 | resp = wa.wrap(chats) 148 | if resp != None: 149 | return resp 150 | else: 151 | raise HTTPException( 152 | status_code=400, detail="Not enough members or chats to analyze from 2022!" 
153 | ) 154 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | emoji==2.2.0 2 | fastapi==0.88.0 3 | matplotlib==3.6.2 4 | numpy==1.23.5 5 | pandas==1.5.2 6 | python_dateutil==2.8.2 7 | requests==2.28.1 8 | scipy==1.9.3 9 | starlette==0.22.0 10 | wordcloud==1.8.2.2 11 | uvicorn==0.20.0 12 | python-multipart==0.0.5 13 | zipfile36==0.1.3 14 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.10.8 -------------------------------------------------------------------------------- /src/interesting_search.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import pandas as pd 4 | import numpy as np 5 | 6 | 7 | def count_per_minute(df): 8 | df["iso_time"] = df["time"].apply(lambda x: str(x)[:-3] + ":00") 9 | df["iso_time"] = pd.to_datetime(df["iso_time"], dayfirst=False) 10 | df["iso_time"] = df["iso_time"].astype(np.int64) 11 | df["iso_time"] = df["iso_time"].apply(lambda x: int(x / 60000000000)) 12 | df = df.copy() 13 | df = df.groupby(["iso_time"]).count() 14 | 15 | df["time"] = df.index 16 | 17 | # drop everything else other than time and chat 18 | df = df[["time", "message"]] 19 | df = df.reset_index(drop=True) 20 | 21 | # rename chat to count 22 | df = df.rename(columns={"message": "count"}) 23 | 24 | return df 25 | 26 | 27 | def interesting_search(original_df, count_df): 28 | # find longest streak 29 | streak = 0 30 | streak_start = 0 31 | streak_end = 0 32 | for i in range(count_df.shape[0] - 1): 33 | if count_df["time"][i + 1] - count_df["time"][i] <= 3: 34 | streak += 1 35 | else: 36 | if streak > streak_end - streak_start: 37 | streak_start = i - streak 38 | streak_end = i 39 | streak = 0 40 | 41 | # stats for streak 42 | longest_streak = streak_end - streak_start 43 | longest_streak_start = count_df["time"][streak_start] * 60000 44 | longest_streak_end = count_df["time"][streak_end] * 60000 45 | total_messages_sent = count_df["count"][streak_start:streak_end].sum() 46 | average_reply_time = (longest_streak_end - longest_streak_start) / ( 47 | total_messages_sent * 1000 48 | ) 49 | 50 | # convert longest streak to datetime 51 | longest_streak_start_dt = datetime.datetime.fromtimestamp( 52 | longest_streak_start / 1000 53 | ).strftime("%B %d, %Y") 54 | longest_streak_end_dt = datetime.datetime.fromtimestamp( 55 | longest_streak_end / 1000 56 | ).strftime("%B %d, %Y") 57 | 58 | # print the stats 59 | print("Longest streak:\t", longest_streak, "minutes") 60 | print("Total messages sent:\t", total_messages_sent, "messages") 61 | print("Longest streak date:\t", longest_streak_start_dt) 62 | # print("Longest streak end: ", longest_streak_end_dt) 63 | print("Average reply time:\t", round(average_reply_time, 2), "seconds") 64 | 65 | # find messages during longest streak 66 | # original_df = original_df[ 67 | # original_df["iso_time"].isin(count_df["time"][streak_start:streak_end]) 68 | # ] 69 | 70 | # odf_json_str = original_df[["time", "sender", "message"]].to_json(orient="records") 71 | # odf_json = json.loads(odf_json_str) 72 | # odf_json[-1]["message"] = odf_json[-1]["message"][:-1] 73 | 74 | # return as dictionary 75 | return { 76 | "streak_duration": longest_streak, 77 | "streak_start": int(longest_streak_start), 78 | "streak_end": 
int(longest_streak_end), 79 | "total_messages_sent": int(total_messages_sent), 80 | "average_reply_time": float(average_reply_time), 81 | # "messages_during_streak": odf_json, 82 | } 83 | 84 | 85 | def get_total_minutes(df): 86 | count_df = count_per_minute(df) 87 | total_mins = count_df.shape[0] 88 | return total_mins, count_df 89 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | # parse a string in different country's dattime formates to return a datetime object 2 | 3 | import datetime 4 | import re 5 | 6 | 7 | def parse_datetime( 8 | s, 9 | dayfirst=True, 10 | ): 11 | # parse a string in different country's dattime formates to return a datetime object 12 | # s: string to be parsed 13 | # return: datetime object 14 | # example: parse_datetime('23/03/22, 5:59 pm') 15 | # example: parse_datetime('23.03.22, 5:59 am') 16 | # example: parse_datetime('23-03-22, 5:59 pm') 17 | # example: parse_datetime('23.03.22, 5.59 pm') 18 | # example: parse_datetime('23.03.22, 5:59') 19 | # example: parse_datetime('23-03-22, 5.59 pm') 20 | # example: parse_datetime('03-23-22, 5:59') 21 | # example: parse_datetime('04/07/20, 15:20:25') 22 | # example: parse_datetime('04/07/20, 5:20:25 pm') 23 | # example: parse_datetime('04/07/20, 5:20:25 am') 24 | 25 | # replace invisible charcaters with space 26 | s = re.sub(r"\s+", " ", s) 27 | 28 | # compile regex pattern to match date and time formats 29 | regex = re.compile( 30 | r"(\d{1,2})[\/\.-](\d{1,2})[\/\.-](\d{2,4})[ ,]*(\d{1,2})[:\.](\d{1,2})[:\.]?(\d{1,2})?[ ]?([ap]m)?" 31 | ) 32 | 33 | # match the string with the regex pattern 34 | match = regex.match(s) 35 | 36 | # if match found 37 | if match: 38 | # extract the date and time from the string 39 | day = int(match.group(1)) 40 | month = int(match.group(2)) 41 | year = int(match.group(3)) 42 | hour = int(match.group(4)) 43 | minute = int(match.group(5)) 44 | second = int(match.group(6)) if match.group(6) else 0 45 | ampm = match.group(7) 46 | 47 | # if year is 2 digit, convert it to 4 digit 48 | if year < 100: 49 | if year <= 22: 50 | year += 2000 51 | else: 52 | year += 1900 53 | 54 | # if ampm is present, convert 12 hour format to 24 hour format 55 | if ampm: 56 | if ampm == "pm" and hour < 12: 57 | hour += 12 58 | elif ampm == "am" and hour == 12: 59 | hour = 0 60 | 61 | # return the datetime object 62 | try: 63 | if dayfirst: 64 | return datetime.datetime(year, month, day, hour, minute, second) 65 | else: 66 | return datetime.datetime(year, day, month, hour, minute, second) 67 | except ValueError: 68 | return None 69 | 70 | # if match not found 71 | else: 72 | # return None 73 | return None 74 | 75 | 76 | # test the function 77 | # print("1.", parse_datetime("23/03/22, 5:59 pm")) 78 | # print("2.", parse_datetime("23.03.22, 5:59 am")) 79 | # print("3.", parse_datetime("23-03-22, 5:59 pm")) 80 | # print("4.", parse_datetime("23.03.22, 5.59 pm")) 81 | # print("5.", parse_datetime("23.03.22, 5:59")) 82 | # print("6.", parse_datetime("23-03-22, 5.59 pm")) 83 | # print("7.", parse_datetime("03-23-22, 5:59", dayfirst=False)) 84 | # print("8.", parse_datetime("04/07/20, 15:20:25")) 85 | # print("9.", parse_datetime("07/23/20, 15:20:25", dayfirst=False)) 86 | # print("10.", parse_datetime("07/23/20, 05:20:25 pm", dayfirst=False)) 87 | # print("11.", parse_datetime("20/03/22, 4:25 pm", dayfirst=True)) 88 | 89 | 90 | # output 91 | # 1. 2022-03-23 17:59:00 92 | # 2. 
2022-03-23 05:59:00 93 | # 3. 2022-03-23 17:59:00 94 | # 4. 2022-03-23 17:59:00 95 | # 5. 2022-03-23 05:59:00 96 | # 6. 2022-03-23 17:59:00 97 | # 7. 2022-03-23 05:59:00 98 | # 8. 2020-07-04 15:20:25 99 | # 9. 2020-07-23 15:20:25 100 | # 10. 2020-07-23 17:20:25 101 | # 11. 2022-03-20 16:25:00 102 | 103 | 104 | # from an array of dates find out if the dates are dayfirst or monthfirst 105 | def check_dayfirst(dates): 106 | # example input: ['23/03/22, 5:59 pm', '23.03.22, 5:59 am', '23-03-22, 5:59 pm', '23.03.22, 5.59 pm', '23.03.22, 5:59', '23-03-22, 5.59 pm', '03-23-22, 5:59', '04/07/20, 15:20:25', '04/07/20, 5:20:25 pm', '04/07/20, 5:20:25 am'] 107 | # example output: True 108 | 109 | # count the number of dates that are parsed correctly when dayfirst is True 110 | count_dayfirst = 0 111 | 112 | # count the number of dates that are parsed correctly when dayfirst is False 113 | count_monthfirst = 0 114 | 115 | # loop through all the dates 116 | for date in dates: 117 | # parse the date with dayfirst=True 118 | parsed_date_dayfirst = parse_datetime(date, dayfirst=True) 119 | 120 | # parse the date with dayfirst=False 121 | parsed_date_monthfirst = parse_datetime(date, dayfirst=False) 122 | 123 | # if date is parsed correctly with dayfirst=True, increment count_dayfirst 124 | if parsed_date_dayfirst: 125 | count_dayfirst += 1 126 | 127 | # if date is parsed correctly with dayfirst=False, increment count_monthfirst 128 | if parsed_date_monthfirst: 129 | count_monthfirst += 1 130 | 131 | # if count_dayfirst is less than count_monthfirst, return False 132 | if count_dayfirst < count_monthfirst: 133 | return False 134 | 135 | return True 136 | 137 | 138 | # test the function 139 | # print(check_dayfirst(["11/05/22, 5:59 pm", "12/05/22, 5:30 am", "13/05/22, 5:59 pm"])) 140 | # print(check_dayfirst(["05/11/22, 5:59 pm", "05/12/22, 5:30 am", "05/13/22, 5:59 pm"])) -------------------------------------------------------------------------------- /src/whatsapp_analyzer.py: -------------------------------------------------------------------------------- 1 | from http.client import HTTPException 2 | import numpy as np 3 | import pandas as pd 4 | import re 5 | import json 6 | import datetime 7 | import random 8 | from wordcloud import WordCloud 9 | import emoji 10 | from collections import Counter 11 | from datetime import timedelta 12 | import time 13 | from dateutil import tz 14 | import pickle 15 | import requests 16 | import scipy.stats as st 17 | import base64 18 | import io 19 | from zipfile import ZipFile 20 | from src.interesting_search import get_total_minutes, interesting_search 21 | from src.utils import check_dayfirst, parse_datetime 22 | 23 | with open("./assets/stopwords/stop_words.pkl", "rb") as f: 24 | stopwords = pickle.load(f) 25 | 26 | 27 | def extract_zip(input_zip): 28 | input_zip = ZipFile(io.BytesIO(input_zip)) 29 | return {name: input_zip.read(name) for name in input_zip.namelist()} 30 | 31 | 32 | def time_extractor(x, phone): 33 | y = 0 34 | if phone == "IOS": 35 | y = x.find("] ") 36 | return x[1:y] 37 | else: 38 | y = x.find(" - ") 39 | return x[:y] 40 | 41 | 42 | def chat_extractor(x, phone): 43 | y = 0 44 | if phone == "IOS": 45 | y = x.find("] ") + 2 46 | else: 47 | y = x.find(" - ") + 3 48 | return x[y:] 49 | 50 | 51 | def person_extractor(x): 52 | y = x.find(": ") 53 | if y != -1: 54 | return x[:y] 55 | else: 56 | return np.nan 57 | 58 | 59 | def message_extractor(x): 60 | y = x.find(": ") + 2 61 | s = "" 62 | if (y - 2) != -1: 63 | s = x[y:] 64 | else: 65 | s = x 66 | if 
( 67 | s == "" 68 | or s == "This message was deleted" 69 | or s == "You deleted this message" 70 | or s.find("image omitted") != -1 71 | or s.find("video omitted") != -1 72 | or s.find("audio omitted") != -1 73 | or s.find("file omitted") != -1 74 | or s.find("sticker omitted") != -1 75 | or s.find("gif omitted") != -1 76 | or s.find("voice omitted") != -1 77 | or s.find("contact omitted") != -1 78 | or s.find("location omitted") != -1 79 | or s.find("document omitted") != -1 80 | or s.find("") != -1 81 | ): 82 | return np.nan 83 | else: 84 | return s 85 | 86 | 87 | def parse_message(s, phone): 88 | time = time_extractor(s, phone) 89 | person_chat = chat_extractor(s, phone) 90 | person = person_extractor(person_chat) 91 | message = message_extractor(person_chat) 92 | return [time.lower(), person, message] 93 | 94 | 95 | def chats_to_df(chats): 96 | REGEX = { 97 | "IOS": "^[{1}[0-9]+[\/|\–|\-|\.][0-9]+[\/|\–|\-|\.][0-9]+,?\s[0-9]+[:|.][0-9]+[:|.][0-9]+.*$", 98 | "ANDROID": "^[0-9]+/[0-9]+/[0-9]+,?\s[0-9]+[:|.][0-9]+\s.*$", 99 | } 100 | new_chats = [] 101 | phone = "ANDROID" 102 | if chats[0].find(" - ") == -1: 103 | phone = "IOS" 104 | c = 0 105 | i = 0 106 | while i < len(chats): 107 | chats[i] = chats[i].replace("\u200e", "").replace("\r", "") 108 | new_chats.append(chats[i]) 109 | i += 1 110 | while i < len(chats) and not bool(re.search(REGEX[phone], chats[i])): 111 | new_chats[c] += "\n" + chats[i] 112 | i += 1 113 | c += 1 114 | 115 | wa_data = pd.DataFrame(new_chats, columns=["chats"]) 116 | wa_data = wa_data["chats"].apply(parse_message, args=(phone,)) 117 | 118 | wa_data = pd.DataFrame(wa_data.tolist(), columns=["time", "sender", "message"]) 119 | 120 | wa_data.columns = ["time", "sender", "message"] 121 | 122 | dayfirst = check_dayfirst(list(wa_data["time"])) 123 | wa_data["time"] = wa_data["time"].apply(parse_datetime, args=(dayfirst,)) 124 | 125 | return wa_data 126 | 127 | 128 | def members(df): 129 | chat_members = df["sender"].unique() 130 | chat_members = [x for x in chat_members if str(x) != "nan"] 131 | return chat_members 132 | 133 | 134 | def getYear2022(df): 135 | df = df[df["time"].dt.year == 2022] 136 | df.dropna(inplace=True) 137 | df.reset_index(drop=True, inplace=True) 138 | return df 139 | 140 | 141 | def extract_emojis(s): 142 | return "".join(c for c in s if c in emoji.EMOJI_DATA) 143 | 144 | 145 | def chats_to_json(chats): 146 | df = chats_to_df(chats) 147 | df_json_str = df.to_json(orient="records") 148 | df_json = json.loads(df_json_str) 149 | df_json[-1]["message"] = df_json[-1]["message"][:-1] 150 | return {"no_of_messages": len(df_json), "chats": df_json} 151 | 152 | 153 | def no_of_messages_per_member(df): 154 | count = df["sender"].value_counts().to_dict() 155 | count_list = [{"member": x, "count": count[x]} for x in count] 156 | return count_list 157 | 158 | 159 | def word_count(df): 160 | df = df.copy() 161 | df["no_of_words"] = df["message"].apply(lambda x: len(str(x).split())) 162 | df = df.reset_index(drop=True) 163 | members = df["sender"].unique() 164 | word_count = {member: 0 for member in members} 165 | for member in members: 166 | sub_df = df[df["sender"] == member] 167 | word_count[member] = sum(sub_df["no_of_words"]) 168 | series = pd.Series(word_count) 169 | series = series.rename("Word Count") 170 | word_dict = dict( 171 | sorted(series.to_dict().items(), key=lambda item: item[1], reverse=True) 172 | ) 173 | word_list = [{"member": x, "count": word_dict[x]} for x in word_dict] 174 | return word_list 175 | 176 | 177 | def chats_month(df): 
178 | df["month"] = pd.DatetimeIndex(df["time"]).month 179 | m_count = df["month"].value_counts().to_dict() 180 | months = [ 181 | "Jan", 182 | "Feb", 183 | "Mar", 184 | "Apr", 185 | "May", 186 | "Jun", 187 | "Jul", 188 | "Aug", 189 | "Sep", 190 | "Oct", 191 | "Nov", 192 | "Dec", 193 | ] 194 | 195 | month_count = [{"month": x, "count": 0} for x in months] 196 | 197 | for mc in m_count: 198 | month_count[mc - 1]["count"] = m_count[mc] 199 | 200 | month_df = pd.DataFrame(month_count) 201 | month_df["month_codes"] = pd.Series(range(1, 13)) 202 | month_corr = month_df["month_codes"].corr(month_df["count"]) 203 | return month_count, month_corr 204 | 205 | 206 | def chats_date(df): 207 | df["date"] = pd.DatetimeIndex(df["time"]).date 208 | 209 | 210 | def check_chat_date(df, date): 211 | return date in df["date"].unique() 212 | 213 | 214 | def convert_long_to_date(long_date): 215 | dt = datetime.datetime.fromtimestamp(long_date / 1000) 216 | date = datetime.date(dt.year, dt.month, dt.day) 217 | return date 218 | 219 | 220 | def get_chat_date_string(df, longest_break_start, longest_break_end): 221 | chats_date(df) 222 | result_chat_date = "" 223 | first_day = False 224 | # loop through all of the days in the year and check if there is a chat on that day 225 | for month in range(1, 13): 226 | for day in range(1, 32): 227 | try: 228 | d = datetime.date(2022, month, day) 229 | except ValueError: 230 | continue 231 | if (not first_day) and check_chat_date(df, d): 232 | first_day = True 233 | 234 | if first_day: 235 | start_gap = convert_long_to_date(longest_break_start) 236 | end_gap = convert_long_to_date(longest_break_end) 237 | if d > start_gap and d < end_gap: 238 | result_chat_date += "2" 239 | continue 240 | if check_chat_date(df, d): 241 | result_chat_date += "0" 242 | else: 243 | result_chat_date += "1" 244 | 245 | else: 246 | result_chat_date += "9" 247 | 248 | return result_chat_date 249 | 250 | 251 | def get_gender(name): 252 | URL = "https://api.genderize.io" 253 | PARAMS = {"name": name} 254 | r = requests.get(url=URL, params=PARAMS) 255 | data = r.json() 256 | return data 257 | 258 | 259 | def get_category(names): 260 | n = [] 261 | for name in names: 262 | x = name.split() 263 | n.append(x[0].lower()) 264 | data = get_gender(n) 265 | genders = [] 266 | gb = {"boy": False, "girl": False} 267 | try: 268 | for d in data: 269 | genders.append(d["gender"]) 270 | if d["gender"] == "male": 271 | gb["boy"] = True 272 | elif d["gender"] == "female": 273 | gb["girl"] = True 274 | except: 275 | print("Gender API calls over") 276 | 277 | return gb 278 | 279 | 280 | def most_used_emoji(df): 281 | emoji_list = df["message"].apply(extract_emojis).tolist() 282 | emoji_str = "".join(emoji_list) 283 | emoji_str = ( 284 | emoji_str.replace("\U0001f3fb", "") 285 | .replace("\U0001f3fc", "") 286 | .replace("\U0001f3fd", "") 287 | .replace("\U0001f3fe", "") 288 | .replace("\U0001f3ff", "") 289 | ) 290 | res = Counter(emoji_str) 291 | top_10 = res.most_common(10) 292 | top_10_list = [{"emoji": x[0], "count": x[1]} for x in top_10] 293 | 294 | return top_10_list 295 | 296 | 297 | def chats_hour(df): 298 | df["hour"] = pd.DatetimeIndex(df["time"]).hour 299 | h_count = df["hour"].value_counts().to_dict() 300 | hour_count = [{"hour": x, "count": 0} for x in range(24)] 301 | for hc in h_count: 302 | hour_count[hc]["count"] = h_count[hc] 303 | return hour_count 304 | 305 | 306 | def get_time_diff(df): 307 | df["time_diff"] = df["time"].diff() 308 | return df 309 | 310 | 311 | def longest_wait(df): 312 | try: 313 | 
df = get_time_diff(df) 314 | df1 = df[df["time_diff"] == df["time_diff"].max()] 315 | max_gap = df1["time_diff"].max() 316 | date1 = df1["time"].iloc[0] 317 | date2 = date1 - max_gap 318 | # convert max_gap to int 64 319 | max_gap = int(max_gap.total_seconds()) 320 | return { 321 | "gap": int(max_gap) * 1000, 322 | "start_time": int(date2.timestamp() * 1000), 323 | "end_time": int(date1.timestamp() * 1000), 324 | } 325 | except: 326 | return { 327 | "gap": 0, 328 | "start_time": 0, 329 | "end_time": 0, 330 | } 331 | 332 | 333 | def who_texts_first(df): 334 | df = get_time_diff(df) 335 | df1 = df[df["time_diff"] > timedelta(minutes=60)] 336 | send_counts = df1["sender"].value_counts().to_dict() 337 | if len(send_counts) == 0: 338 | return "No one" 339 | max_send_counts = max(send_counts, key=send_counts.get) 340 | return max_send_counts 341 | 342 | 343 | def throwback_chats(chats, n): 344 | df = chats_to_df(chats) 345 | df = df.drop("time", axis=1) 346 | x = df["sender"].size 347 | if x > n: 348 | r = random.randint(0, x - n - 1) 349 | df = df.iloc[r : r + n] 350 | df_json_str = df.to_json(orient="records") 351 | df_json = json.loads(df_json_str) 352 | df_json[-1]["message"] = df_json[-1]["message"][:-1] 353 | return {"throwback_chats": df_json} 354 | 355 | 356 | def words_weight(df): 357 | chat_words = "" 358 | for val in df["message"]: 359 | val = str(val) 360 | tokens = val.split() 361 | for i in range(len(tokens)): 362 | tokens[i] = tokens[i].lower() 363 | chat_words += " ".join(tokens) + " " 364 | chat_words = re.sub(r"http\S+", "", chat_words) 365 | if chat_words.strip() == "": 366 | return "chat unavailable" 367 | return chat_words 368 | 369 | 370 | def word_cloud_words(df): 371 | chat_words = words_weight(df) 372 | words_dict = WordCloud( 373 | stopwords=stopwords, 374 | ).process_text(chat_words) 375 | words_dict = dict( 376 | sorted(words_dict.items(), key=lambda item: item[1], reverse=True) 377 | ) 378 | if len(words_dict) > 100: 379 | words_dict = {k: words_dict[k] for k in list(words_dict)[:100]} 380 | max_val = max(words_dict.values()) 381 | min_val = min(words_dict.values()) - 1 382 | diff_val = max_val - min_val 383 | return [ 384 | {"word": k, "count": v, "weight": ((v - min_val) / diff_val)} 385 | for k, v in words_dict.items() 386 | ] 387 | 388 | 389 | def word_cloud(df): 390 | chat_words = words_weight(df) 391 | # mask_arr = np.array(Image.open("assets/masks/walogo.jpg")) 392 | wordcloud = WordCloud( 393 | font_path="assets/fonts/Poppins-Medium.ttf", 394 | # mask=mask_arr, 395 | min_word_length=2, 396 | width=360, 397 | height=480, 398 | stopwords=stopwords, 399 | min_font_size=12, 400 | colormap="gist_ncar", 401 | ) 402 | wc = None 403 | try: 404 | wc = wordcloud.generate(chat_words) 405 | except: 406 | wc = wordcloud.generate("chat unavailable") 407 | 408 | return wc 409 | 410 | 411 | def get_word_cloud(chats): 412 | df = chats_to_df(chats) 413 | return word_cloud_to_base64(df) 414 | 415 | 416 | def word_cloud_to_base64(df): 417 | img = word_cloud(df) 418 | img_bytes = io.BytesIO() 419 | img.to_image().save(img_bytes, format="PNG") 420 | img_bytes = img_bytes.getvalue() 421 | img_base64 = base64.b64encode(img_bytes).decode("utf-8") 422 | return img_base64 423 | 424 | 425 | def most_active_day(df): 426 | df["date"] = pd.DatetimeIndex(df["time"]).date 427 | d_count = df["date"].value_counts() 428 | max_day = d_count.loc[d_count == d_count.max()] 429 | max_day_dict = max_day.to_dict() 430 | max_day_list = [ 431 | { 432 | "date": datetime.datetime( 433 | year=x.year, 
month=x.month, day=x.day, tzinfo=tz.tzutc() 434 | ).timestamp() 435 | * 1000, 436 | "amount": max_day_dict[x], 437 | } 438 | for x in max_day_dict 439 | ][0] 440 | return max_day_list 441 | 442 | 443 | def zscore(amt): 444 | mean = 22000 445 | std = 12000 446 | z = (amt - mean) / std 447 | p = st.norm.cdf(z) 448 | return z, max(min(p, 0.999999), 0.0001) 449 | 450 | 451 | # get median of time difference 452 | def get_median_time_diff(df): 453 | time_df_list = list(df["time_diff"])[1:] 454 | time_df_list = [x.total_seconds() for x in time_df_list] 455 | time_df_list.sort() 456 | if len(time_df_list) == 0: 457 | return 0 458 | return np.median(time_df_list) 459 | 460 | 461 | # get every 10%, 20%, 30%.... 90% of the time difference 462 | def get_time_diff_percentile(df): 463 | time_df_list = list(df["time_diff"])[1:] 464 | time_df_list = [x.total_seconds() for x in time_df_list] 465 | time_df_list.sort() 466 | if len(time_df_list) == 0: 467 | return 0 468 | percentiles = [] 469 | for i in range(1, 51): 470 | percentiles.append(np.percentile(time_df_list, i * 2)) 471 | return percentiles 472 | 473 | 474 | # get the reponsiveness of the chat 475 | def get_responsiveness(df, percentiles): 476 | # get the first greater than zero percentile 477 | for i in range(len(percentiles)): 478 | if percentiles[i] > 0: 479 | print("Chat responsiveness:\t", (i / 50.0)) 480 | return i 481 | return 0 482 | 483 | 484 | def analyze(chats): 485 | df = chats_to_df(chats) 486 | chat_members = members(df) 487 | num_arr = no_of_messages_per_member(df) 488 | words = word_count(df) 489 | month = chats_month(df) 490 | 491 | return { 492 | "members": chat_members, 493 | "no_of_messages": len(df["message"]), 494 | "no_of_messages_per_member": num_arr, 495 | "word_count_per_member": words, 496 | "month_chats_count": month, 497 | } 498 | 499 | 500 | def wrap(chats): 501 | df = getYear2022(chats_to_df(chats)) 502 | if df.shape[0] < 75: 503 | print("\nNot enough members or chats to analyze from 2022!\n\n") 504 | return None 505 | print("\n\n---------------------------------------------") 506 | print("Members") 507 | print("---------------------------------------------") 508 | total_chats = len(df["message"]) 509 | chat_members = members(df) 510 | num_members = len(chat_members) 511 | if num_members < 2: 512 | return None 513 | num_arr = no_of_messages_per_member(df) 514 | # words = word_count(df) 515 | months, month_corr = chats_month(df) 516 | 517 | # get max month 518 | max_month = months[0] 519 | for m in months: 520 | if m["count"] > max_month["count"]: 521 | max_month = m 522 | hours = chats_hour(df) 523 | max_hour = hours[0] 524 | for h in hours: 525 | if h["count"] > max_hour["count"]: 526 | max_hour = h 527 | 528 | active_day = most_active_day(df) 529 | top_10_emoji = most_used_emoji(df) 530 | # cloud_words = word_cloud_words(df) 531 | z, p = zscore(len(df.index)) 532 | 533 | top_percent = 1 - p 534 | 535 | if chat_members: 536 | # print chat members 537 | print(", ".join(chat_members)) 538 | else: 539 | "No members found" 540 | 541 | longest_gap = longest_wait(df) 542 | 543 | talk_string = get_chat_date_string( 544 | df, longest_gap["start_time"], longest_gap["end_time"] 545 | ) 546 | 547 | total_mins, count_df = get_total_minutes(df) 548 | 549 | print("\n\n\n---------------------------------------------") 550 | print(" Chat Statistics") 551 | print("---------------------------------------------") 552 | 553 | print("Total chats:\t\t " + str(total_chats)) 554 | print("Total members:\t " + str(num_members)) 555 | 
print("Total minutes:\t " + str(total_mins)) 556 | 557 | top_percent_100 = round(top_percent * 100, 2) 558 | print("Top percentile:\t ", top_percent_100, "%", sep="") 559 | 560 | print("Most active month:\t " + max_month["month"]) 561 | print("Month correlation:\t", round(month_corr, 4)) 562 | 563 | # convert to 12 hour time 564 | m_hour = max_hour["hour"] % 12 565 | if m_hour == 0: 566 | m_hour = 12 567 | ampm = "AM" 568 | if max_hour["hour"] >= 12: 569 | ampm = "PM" 570 | print( 571 | "Most active hour:\t ", 572 | str(m_hour), 573 | " ", 574 | ampm, 575 | " (", 576 | max_hour["hour"], 577 | ")", 578 | sep="", 579 | ) 580 | 581 | print( 582 | "Most active day:\t " 583 | + datetime.datetime.fromtimestamp(active_day["date"] / 1000).strftime( 584 | "%B %d, %Y" 585 | ) 586 | ) 587 | 588 | # get median of time difference 589 | # median_time_diff = get_median_time_diff(df) 590 | 591 | # get every 10%, 20%, 30%.... 90% of the time difference 592 | time_diff_percentile = get_time_diff_percentile(df) 593 | 594 | # get the reponsiveness of the chat 595 | responsiveness = get_responsiveness(df, time_diff_percentile) 596 | 597 | longest_gap_in_days = int(longest_gap["gap"] / (24 * 60 * 60 * 1000)) 598 | longest_session = interesting_search(df, count_df) 599 | 600 | print("Longest gap:\t\t", longest_gap_in_days, "days") 601 | print( 602 | "Longest gap start:\t", 603 | datetime.datetime.fromtimestamp(longest_gap["start_time"] / 1000).strftime( 604 | "%B %d, %Y" 605 | ), 606 | ) 607 | print( 608 | "Longest gap end:\t", 609 | datetime.datetime.fromtimestamp(longest_gap["end_time"] / 1000).strftime( 610 | "%B %d, %Y" 611 | ), 612 | ) 613 | 614 | return { 615 | "group": len(chat_members) > 2, 616 | "members": chat_members, 617 | # "gender": get_category(chat_members), 618 | "total_no_of_chats": total_chats, 619 | "total_no_of_minutes": total_mins, 620 | "top_percent": top_percent, 621 | # "z_score": z, 622 | "most_active_member": num_arr[0] if len(num_arr) != 0 else "No one", 623 | "no_of_messages_per_member": num_arr, 624 | # "word_count_per_member": words, 625 | # "median_reply_time": (median_time_diff / 60.0), 626 | # "reply_time_percentile": [x / 60.0 for x in time_diff_percentile], 627 | "chat_responsiveness": responsiveness / 50.0, 628 | "most_active_month": max_month, 629 | "month_correlation": month_corr, 630 | "monthly_chats_count": months, 631 | "most_active_hour": max_hour, 632 | "hourly_count": hours, 633 | "most_active_day": active_day, 634 | "longest_session": longest_session, 635 | "longest_gap": longest_gap, 636 | "no_talk_string": talk_string, 637 | "who_texts_first": who_texts_first(df), 638 | # "most_used_emoji": top_10_emoji[0], 639 | "top_10_emojis": top_10_emoji, 640 | # "most_used_word": cloud_words[0], 641 | # "word_cloud_words": cloud_words, 642 | "word_cloud_base64": word_cloud_to_base64(df), 643 | } 644 | --------------------------------------------------------------------------------