├── message_analyser ├── retriever │ ├── __init__.py │ ├── vkOpt.py │ └── telegram.py ├── __init__.py ├── misc.py ├── myMessage.py ├── storage.py ├── analyser.py ├── structure_tools.py ├── GUI.py └── plotter.py ├── .gitignore ├── examples ├── sample one │ ├── heat_map.png │ ├── wordcloud.png │ ├── barplot_emojis.png │ ├── lineplot_messages.png │ ├── barplot_messages_per_day.png │ ├── barplot_non_text_messages.png │ ├── distplot_messages_per_day.png │ ├── lineplot_message_length.png │ ├── pie_messages_per_author.png │ ├── distplot_messages_per_hour.png │ ├── distplot_messages_per_month.png │ ├── barplot_messages_per_minutes.png │ ├── barplot_messages_per_weekday.png │ └── stackplot_non_text_messages_percentage.png └── sample two │ ├── heat_map.png │ ├── barplot_emojis.png │ ├── lineplot_messages.png │ ├── barplot_messages_per_day.png │ ├── barplot_non_text_messages.png │ ├── distplot_messages_per_day.png │ ├── lineplot_message_length.png │ ├── pie_messages_per_author.png │ ├── distplot_messages_per_hour.png │ ├── distplot_messages_per_month.png │ ├── barplot_messages_per_minutes.png │ ├── barplot_messages_per_weekday.png │ └── stackplot_non_text_messages_percentage.png ├── requirements.txt ├── config.ini ├── config.example.ini ├── main.py ├── LICENSE └── README.md /message_analyser/retriever/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__/ 3 | *.session 4 | *.session-journal -------------------------------------------------------------------------------- /examples/sample one/heat_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/heat_map.png 
-------------------------------------------------------------------------------- /examples/sample one/wordcloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/wordcloud.png -------------------------------------------------------------------------------- /examples/sample two/heat_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/heat_map.png -------------------------------------------------------------------------------- /examples/sample one/barplot_emojis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/barplot_emojis.png -------------------------------------------------------------------------------- /examples/sample two/barplot_emojis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/barplot_emojis.png -------------------------------------------------------------------------------- /examples/sample one/lineplot_messages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/lineplot_messages.png -------------------------------------------------------------------------------- /examples/sample two/lineplot_messages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/lineplot_messages.png -------------------------------------------------------------------------------- /examples/sample one/barplot_messages_per_day.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/barplot_messages_per_day.png -------------------------------------------------------------------------------- /examples/sample one/barplot_non_text_messages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/barplot_non_text_messages.png -------------------------------------------------------------------------------- /examples/sample one/distplot_messages_per_day.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/distplot_messages_per_day.png -------------------------------------------------------------------------------- /examples/sample one/lineplot_message_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/lineplot_message_length.png -------------------------------------------------------------------------------- /examples/sample one/pie_messages_per_author.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/pie_messages_per_author.png -------------------------------------------------------------------------------- /examples/sample two/barplot_messages_per_day.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/barplot_messages_per_day.png -------------------------------------------------------------------------------- /examples/sample two/barplot_non_text_messages.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/barplot_non_text_messages.png -------------------------------------------------------------------------------- /examples/sample two/distplot_messages_per_day.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/distplot_messages_per_day.png -------------------------------------------------------------------------------- /examples/sample two/lineplot_message_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/lineplot_message_length.png -------------------------------------------------------------------------------- /examples/sample two/pie_messages_per_author.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/pie_messages_per_author.png -------------------------------------------------------------------------------- /examples/sample one/distplot_messages_per_hour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/distplot_messages_per_hour.png -------------------------------------------------------------------------------- /examples/sample one/distplot_messages_per_month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/distplot_messages_per_month.png -------------------------------------------------------------------------------- /examples/sample two/distplot_messages_per_hour.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/distplot_messages_per_hour.png -------------------------------------------------------------------------------- /examples/sample two/distplot_messages_per_month.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/distplot_messages_per_month.png -------------------------------------------------------------------------------- /examples/sample one/barplot_messages_per_minutes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/barplot_messages_per_minutes.png -------------------------------------------------------------------------------- /examples/sample one/barplot_messages_per_weekday.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/barplot_messages_per_weekday.png -------------------------------------------------------------------------------- /examples/sample two/barplot_messages_per_minutes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/barplot_messages_per_minutes.png -------------------------------------------------------------------------------- /examples/sample two/barplot_messages_per_weekday.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/barplot_messages_per_weekday.png -------------------------------------------------------------------------------- /examples/sample 
one/stackplot_non_text_messages_percentage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample one/stackplot_non_text_messages_percentage.png -------------------------------------------------------------------------------- /examples/sample two/stackplot_non_text_messages_percentage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vlajnaya-mol/message-analyser/HEAD/examples/sample two/stackplot_non_text_messages_percentage.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-dateutil==2.8.0 2 | 3 | telethon==1.5.5 4 | 5 | numpy==1.16.1 6 | 7 | pandas==0.24.1 8 | 9 | matplotlib==3.0.2 10 | 11 | seaborn==0.9.0 12 | 13 | wordcloud==1.5.0 14 | 15 | emoji==0.5.1 -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [telegram_secrets] 2 | api_id = *Your API ID* 3 | api_hash = *Your API HASH* 4 | phone_number = *Your phone* 5 | session_name = message analyser 6 | 7 | [session_params] 8 | dialog_id = 9 | vkopt_file = 10 | words_file = 11 | your_name = 12 | target_name = 13 | 14 | -------------------------------------------------------------------------------- /config.example.ini: -------------------------------------------------------------------------------- 1 | [telegram_secrets] 2 | api_id = 123456 3 | api_hash = d49161cade9d408c804a5d58b6ec0aef 4 | phone_number = +79123456789 5 | session_name = username1 6 | 7 | [session_params] 8 | dialog_id = 123456789 9 | vkopt_file = 10 | words_file = 11 | your_name = username1 12 | target_name = username2 13 | 14 | 
-------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from message_analyser.GUI import start_gui 3 | 4 | 5 | if __name__ == "__main__": 6 | aio_loop = asyncio.get_event_loop() 7 | try: 8 | aio_loop.run_until_complete(start_gui(aio_loop)) 9 | finally: 10 | if not aio_loop.is_closed(): 11 | aio_loop.close() -------------------------------------------------------------------------------- /message_analyser/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger("message_analyser") 4 | logger.setLevel(logging.INFO) 5 | ch = logging.StreamHandler() 6 | ch.setLevel(logging.INFO) 7 | formatter = logging.Formatter("%(asctime)s - %(message)s") 8 | ch.setFormatter(formatter) 9 | logger.addHandler(ch) 10 | -------------------------------------------------------------------------------- /message_analyser/misc.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | 4 | delay = 0.05 5 | months_border = 2 # if the conversation is shorter than this values than xticks will be weeks, not months. 
def avg(l):
    """Return the arithmetic mean of ``l``.

    Args:
        l (sequence of numbers): Values to average; must support truth testing and ``len``.

    Returns:
        ``sum(l) / len(l)``, or the integer 0 when ``l`` is empty.
    """
    if not l:
        return 0
    return sum(l) / len(l)


def log_line(*args):
    """Log the arguments at INFO level through the "message_analyser" logger.

    Every argument is converted with ``str``, the pieces are joined with single spaces
    and a newline is appended to the resulting message.
    """
    logging.getLogger("message_analyser").log(logging.INFO, ' '.join([str(arg) for arg in args]) + '\n')


def time_offset(date):
    """Return the local UTC offset, in hours, that applies at ``date``.

    Args:
        date (datetime): The moment of interest; its ``timetuple()`` is interpreted as local time.

    Returns:
        float: Hours east of UTC (positive) or west of UTC (negative).
    """
    local = time.localtime(int(time.mktime(date.timetuple())))
    # tm_gmtoff is the offset that was really in force at that instant, so it stays correct for
    # dates that fall under a different historical offset than the one the zone uses today.
    # time.timezone / time.altzone only describe the zone's current rules.
    gmtoff = getattr(local, "tm_gmtoff", None)
    if gmtoff is not None:
        return gmtoff / 60 / 60
    # Fallback for a struct_time without tm_gmtoff: the original DST-flag based formula.
    return (time.timezone if local.tm_isdst == 0 else time.altzone) / 60 / 60 * -1
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # message-analyser 2 | Statistical analysis of VKontakte and Telegram message history. 3 | ![front example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/heat_map.png) 4 | 5 | ### Dependencies 6 | * [Telethon](https://github.com/LonamiWebs/Telethon) 7 | * [seaborn](https://github.com/mwaskom/seaborn) 8 | * [wordcloud](https://github.com/amueller/word_cloud) 9 | 10 | ### Installation 11 | * Use Python3.6. 3.7 version may not work properly. 12 | * `git clone https://github.com/vlajnaya-mol/message-analyser` 13 | * Install `requirements.txt`. (`pip install -r /path/to/requirements.txt`) 14 | 15 | ### Usage 16 | #### Execution 17 | * Run `python main.py` 18 | * Follow GUI commands 19 | 20 | #### Telegram messages 21 | * You need API Hash and API ID from [here](https://core.telegram.org/api/obtaining_api_id) 22 | * Be sure to use proxy if Telegram is blocked in your country. 23 | 24 | #### VKontakte messages 25 | * Install [VkOpt extension](http://vkopt.net/) 26 | * Save Your conversation as .txt file using this extension 27 | 28 | Be sure You used **default** format settings: 29 | 30 | ``` 31 | %username% (%date%): 32 | %message% 33 | 34 | HH:MM:ss dd/mm/yyyy 35 | ``` 36 | * You can concatenate two VkOpt files and use as one 37 | * Include this file in the analysis process 38 | 39 | #### Words 40 | * Write words You are interested in to a file 41 | * Be sure words are saved correctly. Cyrillic words are ruined by saving in ASCII format. 42 | * Include this file in the analysis process 43 | 44 | #### Manual analysis 45 | * Fill `config.ini` file and use `retrieve_and_analyse(loop)` instead of using GUI. 
46 | * Use `analyse_from_file(path)` function instead of redownloading messages 47 | 48 | ### Examples 49 | * All examples can be found [here](examples/) 50 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/barplot_messages_per_day.png) 51 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/barplot_messages_per_minutes.png) 52 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/barplot_messages_per_weekday.png) 53 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/barplot_non_text_messages.png) 54 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/distplot_messages_per_day.png) 55 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/lineplot_message_length.png) 56 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/lineplot_messages.png) 57 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/pie_messages_per_author.png) 58 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/stackplot_non_text_messages_percentage.png) 59 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/barplot_emojis.png) 60 | ![other example](https://github.com/vlajnaya-mol/message-analyser/blob/master/examples/sample%20one/wordcloud.png) 61 | 62 | ### Potential project improvements 63 | - analysis of group chats. 64 | - improve tkinter theme. 65 | - add VkOpt stickers as emojis to messages. 66 | - add plot correlation between the number of voice messages and the average message length. 67 | - add "first-to-write" and "response time (delay)" plots (lineplot). 68 | - add n-grams plot (lineplot). 
# Pattern taken from https://stackoverflow.com/a/7160778.
# Compiled once at import time instead of on every islink() call.
_LINK_REGEX = re.compile(
    r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def islink(string):
    """Tell whether the WHOLE ``string`` is a single URL.

    Args:
        string (str): Text to check.

    Returns:
        bool: True if the text as a whole matches the URL pattern. Anything that is not a
        ``str`` (for example ``None``) is reported as not being a link.
    """
    if not isinstance(string, str):
        return False
    return _LINK_REGEX.match(string) is not None


class MyMessage(dict):
    """Represents a message entity from some messenger.

    The attributes are stored as dictionary items (which keeps the object JSON-serialisable
    as a plain dict) and are additionally readable through attribute access.

    Attributes:
        See __init__ args.
    """

    def __init__(self, text, date, author,
                 is_forwarded=False,
                 document_id=None,
                 has_photo=False,
                 has_voice=False,
                 has_audio=False,
                 has_video=False,
                 has_sticker=False,
                 is_link=None):
        """Inits MyMessage class with all its attributes values.

        Notes:
            How to check a message for being a file:
                self.has_document = document_id is not None
                self.is_file = text == "" and (self.has_photo or self.has_document)
                # Because sometimes a photo is not considered as a document.

        Args:
            text (str): A raw content of the message.
            date (string ("%Y-%m-%d %H:%M:%S") date or datetime obj): A time when this message was sent.
            author (str): Author's name.
            is_forwarded (bool): True if the message is forwarded from another person.
            document_id (int): Integer id of the document (-1 for vkOpt messages, None for no document).
            has_photo (bool): True if the message has photo.
            has_voice (bool): True if the message has voice-message file attached.
            has_audio (bool): True if the message has audio file attached (NOT voice!).
            has_video (bool): True if the message has video-message file attached (not just a VIDEO!).
            has_sticker (bool): True if the message has sticker.
            is_link (bool): True if the whole text of the message is a link.
                When None, the value is computed from ``text`` with ``islink``.

        Raises:
            ValueError: If ``date`` is not a datetime and does not match "%Y-%m-%d %H:%M:%S".
        """
        super().__init__()
        if not isinstance(date, datetime):
            date = datetime.strptime(str(date), "%Y-%m-%d %H:%M:%S")
        if is_link is None:
            is_link = islink(text)
        # The insertion order below is the order of the keys in a serialised message.
        self.update({"text": text,
                     "date": date,
                     "author": author,
                     "is_forwarded": is_forwarded,
                     "document_id": document_id,
                     "has_photo": has_photo,
                     "has_voice": has_voice,
                     "has_audio": has_audio,
                     "has_video": has_video,
                     "has_sticker": has_sticker,
                     "is_link": is_link
                     })

    def __str__(self):
        return (f"Author = {self.author}\n"
                f"Content = [{self.text[:100] + '[...]' if len(self.text) > 100 else self.text}]\n"
                f"Date = {self.date}\n"
                f"Contains document = {self.document_id is not None}\n"
                f"Has photo = {self.has_photo}\n"
                f"Is link = {self.is_link}\n"
                f"Is forwarded = {self.is_forwarded}\n")

    def __repr__(self):
        return str(self)

    def __getattr__(self, attr):
        """Expose dictionary items as attributes.

        Raises:
            AttributeError: If there is no such item. Raising AttributeError (not KeyError) is
                what ``hasattr``, ``getattr`` with a default and the copy/pickle protocols expect
                when they probe for optional attributes.
        """
        try:
            return self[attr]
        except KeyError:
            raise AttributeError(attr) from None

    def __setattr__(self, key, value):
        """Store a NEW attribute as a dictionary item; refuse to overwrite an existing one."""
        if key in self:
            raise Exception("Can't mutate an Immutable: self.%s = %r" % (key, value))
        self[key] = value

    @classmethod
    def from_dict(cls, d):
        """Build a message from a dictionary whose keys are the __init__ argument names."""
        return cls(**d)
def store_session_params(params):
    """Persist the parameters of the current analysis session in the config file.

    Args:
        params (dict): Session parameters. The keys read here are "from_vk", "from_telegram",
            "dialogue" (a string ending with "(id=<digits>)"; only read when "from_telegram" is
            truthy), "vkopt_file", "words_file", "plot_words", "your_name" and "target_name".

    Raises:
        AssertionError: If neither message source is selected or one of the names is empty.
        ValueError: If the Telegram dialogue string does not end with "(id=<digits>)".
    """
    config_file_name = _get_config_file_name()
    config_parser = configparser.ConfigParser()
    config_parser.read(config_file_name, encoding="utf-8-sig")
    # A missing or freshly created config file has no sections; set() would raise NoSectionError.
    if not config_parser.has_section("session_params"):
        config_parser.add_section("session_params")

    assert params["from_vk"] or params["from_telegram"]
    dialog_id = ""
    if params["from_telegram"]:
        found = re.search(r"\(id=([0-9]+)\)$", params["dialogue"])
        if found is None:
            raise ValueError(f"No dialog id found in {params['dialogue']!r}")
        dialog_id = found.group(1)
    config_parser.set("session_params", "dialog_id", dialog_id)
    config_parser.set("session_params", "vkopt_file", params["vkopt_file"] if params["from_vk"] else "")
    config_parser.set("session_params", "words_file", params["words_file"] if params["plot_words"] else "")

    assert params["your_name"] and params["target_name"]
    config_parser.set("session_params", "your_name", params["your_name"])
    config_parser.set("session_params", "target_name", params["target_name"])
    with open(config_file_name, "w+", encoding="utf-8") as config_file:
        config_parser.write(config_file)
    log_line(f"Session parameters were stored in {config_file_name} file.")


def get_session_params():
    """Read the stored session parameters from the config file.

    Returns:
        A tuple ``(dialog_id, vkopt_file, words_file, your_name, target_name)``.
        ``dialog_id`` is an int (-1 when nothing is stored); the other values are strings
        that default to "" when the option is absent.
    """
    config_file_name = _get_config_file_name()
    config_parser = configparser.ConfigParser()
    config_parser.read(config_file_name, encoding="utf-8-sig")
    dialog_id = config_parser.get("session_params", "dialog_id", fallback="")
    dialog_id = int(dialog_id) if dialog_id else -1
    vkopt_file = config_parser.get("session_params", "vkopt_file", fallback="")
    words_file = config_parser.get("session_params", "words_file", fallback="")
    your_name = config_parser.get("session_params", "your_name", fallback="")
    target_name = config_parser.get("session_params", "target_name", fallback="")
    log_line(f"Session parameters were received from {config_file_name} file.")
    return dialog_id, vkopt_file, words_file, your_name, target_name
def store_telegram_secrets(api_id, api_hash, phone_number, session_name="Message retriever"):
    """Persist the Telegram credentials in the config file.

    Args:
        api_id (str/int): Telegram API id.
        api_hash (str): Telegram API hash.
        phone_number (str): A phone number connected to such id/hash pair.
        session_name (str): A name of the Telegram session.
    """
    config_file_name = _get_config_file_name()
    config_parser = configparser.ConfigParser()
    config_parser.read(config_file_name, encoding="utf-8-sig")
    # A missing or freshly created config file has no sections; set() would raise NoSectionError.
    if not config_parser.has_section("telegram_secrets"):
        config_parser.add_section("telegram_secrets")
    # ConfigParser only accepts string values, while api_id may arrive as an int.
    config_parser.set("telegram_secrets", "api_id", str(api_id))
    config_parser.set("telegram_secrets", "api_hash", str(api_hash))
    config_parser.set("telegram_secrets", "session_name", str(session_name))
    config_parser.set("telegram_secrets", "phone_number", str(phone_number))
    with open(config_file_name, "w+", encoding="utf-8") as config_file:
        config_parser.write(config_file)
    log_line(f"Telegram secrets were stored in {config_file_name} file.")


def get_telegram_secrets():
    """Read the Telegram credentials from the config file.

    Returns:
        A tuple of strings ``(api_id, api_hash, phone_number, session_name)``;
        an absent option is returned as "".
    """
    config_file_name = _get_config_file_name()
    config_parser = configparser.ConfigParser()
    config_parser.read(config_file_name, encoding="utf-8-sig")
    api_id = config_parser.get("telegram_secrets", "api_id", fallback="")
    api_hash = config_parser.get("telegram_secrets", "api_hash", fallback="")
    phone_number = config_parser.get("telegram_secrets", "phone_number", fallback="")
    session_name = config_parser.get("telegram_secrets", "session_name", fallback="")
    log_line(f"Telegram secrets were received from {config_file_name} file.")
    return api_id, api_hash, phone_number, session_name


def store_msgs(file_path, msgs):
    """Dump the messages to ``file_path`` as JSON.

    Values the json module cannot serialise by itself (such as datetime objects) are
    written as their ``str`` representation.
    """
    with open(file_path, 'w') as fp:
        json.dump(msgs, fp, default=str)
    log_line(f"{len(msgs)} messages were stored in {file_path} file.")


def store_top_words_count(words, your_words_cnt, target_words_cnt, file_path):
    """Write a per-word usage table to ``file_path`` (UTF-8, comma-separated).

    Args:
        words (iterable of str): The words to report, in output order.
        your_words_cnt (mapping): Word -> how many times you sent it.
        target_words_cnt (mapping): Word -> how many times the target sent it.
        file_path (str): The file to (over)write.
    """
    with open(file_path, 'w', encoding="utf-8") as fp:
        fp.write("Word, You sent, Target sent, Total\n")
        for word in words:
            fp.write(f"{word}, {your_words_cnt[word]}, {target_words_cnt[word]}, "
                     f"{your_words_cnt[word]+target_words_cnt[word]}\n")
def get_msgs(file_path):
    """Load messages from a JSON file containing a list of message dictionaries.

    Returns:
        A list of MyMessage objects, in file order.
    """
    with open(file_path, 'r') as f:
        msgs = [MyMessage.from_dict(msg) for msg in json.load(f)]
    log_line(f"{len(msgs)} messages were received from {file_path} file.")
    return msgs


def get_words(file_path):
    """Read the words of interest from a text file, one word per line.

    A line is kept only if, after stripping surrounding whitespace, it is non-empty and
    consists solely of letters, apostrophes and backticks. Blank lines are skipped: an
    empty string is not a word.

    Returns:
        A list of words, in file order.
    """
    with open(file_path, 'r', encoding="utf-8-sig") as f:
        candidates = [line.strip() for line in f]
    words = [word for word in candidates
             if word and all(ch.isalpha() or ch == '\'' or ch == '`' for ch in word)]
    log_line(f"{len(words)} words were received from {file_path} file.")
    return words
36 | Message is forwarded if only it has attached forwarded messages and doesn't contain any text. 37 | 38 | Args: 39 | msg (dict): Representation of a VkOpt message. 40 | 41 | Returns: 42 | MyMessage representation of vkOpt message 43 | """ 44 | return MyMessage(text=msg["text"], date=msg["date"], 45 | author=msg["author"], 46 | has_sticker=msg["attachment"].startswith("{\"type\":\"sticker\""), 47 | is_forwarded=msg["has_forwards"] and not msg["text"], 48 | document_id=-1 if msg["attachment"] and not msg["attachment"].startswith( 49 | "vk.com/photo") else None, 50 | has_photo=msg["attachment"].startswith("vk.com/photo"), 51 | has_voice=("audio_msg.opus" in msg["attachment"]) or 52 | ("voice_message.webm" in msg["attachment"]) or 53 | ("audiocomment.3gp" in msg["attachment"]), 54 | has_audio=msg["attachment"].startswith("vk.com/audio"), 55 | has_video=False, # msg["attachment"].startswith("vk.com/video"), 56 | # vk.com doesn't have video messages in the way Telegram does. 57 | is_link=True if (msg["attachment"].startswith("{\"type\":\"wall\"") or 58 | msg["attachment"].startswith("{\"type\":\"link\"")) else None) 59 | 60 | 61 | def _parse_lines(lines, your_name, target_name, num=1000000): 62 | """Parses given text lines and retrieves a message list. 63 | 64 | Notes: 65 | Parses messages from vkOpt GChrome extension with a DEFAULT message format. 66 | Appropriate message format is "%username% (%date%): 67 | %message%" 68 | Appropriate datetime format is "HH:MM:ss dd/mm/yyyy". 69 | More than one nested forwarded messages are counted as ONE forwarded message. 70 | ... As well as a message with multiple photos counts as ONE photo. 71 | ... As well as a message with multiple audio files ... what the heck? 72 | 73 | Args: 74 | your_name (str): Your name. 75 | target_name (str): Target's name. 76 | lines (list of strings): Text lines of the file. 77 | num (int): Max number of the messages to retrieve. 
78 | 79 | Returns: 80 | List of dictionaries such as: 81 | { 82 | "text": text of the message (str), 83 | "has_forwards": flag (bool), 84 | "attachment": text (str) of the attachment (without first line) 85 | } 86 | """ 87 | lines[0] = lines[0].replace('\ufeff', '') # remove start character 88 | # assert lines[0].startswith(target_name) or lines[0].strip().startswith(your_name) 89 | date_pattern = "[0-2][0-9]:[0-5][0-9]:[0-5][0-9] [0-3][0-9]/[0-1][0-9]/([0-9]{4})" 90 | date_regex = re.compile(date_pattern) 91 | title_ending_regex = re.compile(" \(" + date_pattern + "\):\n$") 92 | msg_title_regex = re.compile("^\t*(" + your_name + '|' + target_name + ") \(" + date_pattern + "\):\n$") 93 | msgs = [] 94 | current_msg = {"text": "", "has_forwards": False, "attachment": ""} 95 | i = 0 96 | while i < len(lines) and len(msgs) <= num: 97 | line = lines[i] 98 | if line.startswith("Attachments:["): 99 | i += 1 100 | current_msg["attachment"] = lines[i] 101 | else: 102 | search = title_ending_regex.search(line) 103 | if search is not None and search.span()[1] == len(line): 104 | if line[0].isspace(): 105 | current_msg["has_forwards"] = True 106 | i += 1 107 | else: 108 | if not msg_title_regex.match(line): 109 | log_line(f"[{line}] DOES NOT MATCH ANY SUGGESTED NAME! 
NO VK OPT MESSAGES WILL BE RECEIVED!") 110 | return [] 111 | # removing redundant spaces after the message 112 | current_msg["text"] = current_msg["text"][:-3] 113 | msgs.append(current_msg) 114 | current_msg = {"text": "", "has_forwards": False, "attachment": ""} 115 | current_msg["date"] = datetime.strptime(date_regex.search(line).group(), "%H:%M:%S %d/%m/%Y") 116 | current_msg["author"] = your_name if line.startswith(your_name) else target_name 117 | elif not current_msg["has_forwards"]: 118 | if current_msg["attachment"]: 119 | current_msg["attachment"] += line 120 | else: 121 | current_msg["text"] += line 122 | i += 1 123 | if i > 0: 124 | current_msg["text"] = current_msg["text"] if current_msg["attachment"] else current_msg["text"][:-3] 125 | msgs.append(current_msg) 126 | # first message is just a template and should be removed 127 | return msgs[1:] 128 | -------------------------------------------------------------------------------- /message_analyser/retriever/telegram.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import message_analyser.storage as storage 3 | from dateutil.relativedelta import relativedelta 4 | from telethon import TelegramClient # , sync 5 | from telethon.tl.types import Message 6 | from telethon.errors.rpcerrorlist import ApiIdInvalidError, PhoneNumberInvalidError, PhoneCodeInvalidError, \ 7 | SessionPasswordNeededError, PasswordHashInvalidError, FloodWaitError 8 | from message_analyser.myMessage import MyMessage 9 | from message_analyser.misc import log_line, time_offset 10 | 11 | 12 | async def get_str_dialogs(client=None, loop=None): 13 | """Retrieves a list with all user-dialogs of a current client. 14 | 15 | Args: 16 | client (TelegramClient object, optional): A client. 17 | loop (asyncio.windows_events._WindowsSelectorEventLoop, optional): An event loop. 18 | 19 | Returns: 20 | A list of strings. 
An example: 21 | 22 | ["Alex (id=00001)", "Kate (id=99990)"] 23 | 24 | Where Alex and Kate are names, 00001 and 99990 are IDs of their dialogs. 25 | """ 26 | return [f"{dialog.name} (id={dialog.id})" for dialog in await _get_dialogs(client, loop)] 27 | 28 | 29 | async def get_sign_in_results(api_id, api_hash, code, phone_number, password, session_name, loop=None): 30 | """Tries to sign-in in Telegram with given parameters. 31 | 32 | Notes: Automatically creates .session file for further sign-ins. 33 | 34 | Args: 35 | api_id (str/int): Telegram API id. 36 | api_hash (str): Telegram API hash. 37 | code (str/int): A confirmation code. 38 | phone_number (str): A phone number connected to such id/hash pair. 39 | password (str): 2FA password (if needed). 40 | session_name (str): A name of the current session. 41 | loop (asyncio.windows_events._WindowsSelectorEventLoop, optional): An event loop. 42 | 43 | Returns: 44 | A string describing the results of sign-in. 45 | """ 46 | try: 47 | client = TelegramClient(session_name, api_id, api_hash, loop=loop) 48 | await client.connect() 49 | except (ApiIdInvalidError, ValueError): 50 | log_line("Unsuccessful sign-in! Wrong API.") 51 | return "wrong api" 52 | except OSError: 53 | log_line("No Internet connection.") 54 | return "no internet" 55 | try: 56 | if not await client.is_user_authorized(): 57 | await client.send_code_request(phone_number) 58 | try: 59 | await client.sign_in(phone_number, code) 60 | except SessionPasswordNeededError: 61 | await client.sign_in(phone_number, password=password) 62 | if not await client.is_user_authorized(): 63 | raise PhoneCodeInvalidError(request=None) 64 | except ApiIdInvalidError: 65 | log_line("Unsuccessful sign-in! Wrong API.") 66 | return "wrong api" 67 | except PhoneCodeInvalidError: 68 | log_line("Unsuccessful sign-in! Need code.") 69 | return "need code" 70 | except PasswordHashInvalidError: 71 | log_line("Unsuccessful sign-in! 
Need password.") 72 | return "need password" 73 | except (PhoneNumberInvalidError, TypeError): 74 | log_line("Unsuccessful sign-in! Need phone.") 75 | return "need phone" 76 | except FloodWaitError as err: 77 | log_line(f'Unsuccessful sign-in! {err.message}') 78 | return f'need wait for {err.seconds}' 79 | finally: 80 | if client.is_connected(): 81 | await client.disconnect() 82 | log_line("Successful sign-in.") 83 | return "success" 84 | 85 | 86 | async def get_telegram_messages(your_name, target_name, loop=None, target_id=None, num=1000000): 87 | """Retrieves a list of messages from Telegram dialogue. 88 | 89 | Notes: 90 | Requires a ready-to-use Telegram secrets (id, hash etc). 91 | Asks for target's id in a case this parameter is None. 92 | Retrieves a photo album as distinct messages with photos. 93 | 94 | Args: 95 | your_name (str): Your name. 96 | target_name (str): Target's name. 97 | loop (asyncio.windows_events._WindowsSelectorEventLoop, optional): An event loop. 98 | target_id (int,optional): Target's dialogue id. 99 | num (int,optional): No more than num NEWEST messages will be retrieved. 100 | 101 | Returns: 102 | A list of MyMessage objects (from older messages to newer). 
async def _get_dialogs(client=None, loop=None):
    """Retrieves all user dialogs (dialogs whose ``is_user`` flag is set).

    Args:
        client (TelegramClient object, optional): A client to query. When it is None a temporary
            client is created with _get_client(loop) and used inside an ``async with`` block.
        loop (optional): An event loop, only forwarded to _get_client when no client is given.

    Returns:
        A list of dialog objects for which ``dialog.is_user`` is truthy, in the order returned by
        ``client.get_dialogs()``.
    """
    if client is None:
        # No client supplied: open our own and reuse the single filtering path below.
        async with (await _get_client(loop)) as own_client:
            return await _get_dialogs(own_client)
    return [dialog for dialog in await client.get_dialogs() if dialog.is_user]
147 | 148 | Returns: 149 | TelegramClient object. 150 | """ 151 | api_id, api_hash, phone_number, session_name = storage.get_telegram_secrets() 152 | if loop: 153 | client = TelegramClient(session_name, api_id, api_hash, loop=loop) 154 | else: 155 | client = TelegramClient(session_name, api_id, api_hash) 156 | await client.connect() 157 | 158 | if not await client.is_user_authorized(): 159 | await client.send_code_request(phone_number) 160 | await client.sign_in(phone_number, input("Please enter the code you received: ")) 161 | return client 162 | 163 | 164 | async def _get_target_dialog_id(client): 165 | """Interacts with user to get an id of the target's dialogue. 166 | 167 | Returns: 168 | Integer value of target's dialogue id. 169 | """ 170 | print("Here is a list of all your dialogues. Please find an id of a dialogue you want to analyse messages from.") 171 | for dialog in await get_str_dialogs(client): 172 | print(dialog) 173 | target_id = int(input("Input target dialog ID :")) 174 | return target_id 175 | 176 | 177 | def _telethon_msg_to_mymessage(msg, target_id, your_name, target_name): 178 | """Transforms telethon.tl.types.Message obj to MyMessage obj. 179 | 180 | Notes: 181 | An emoji representation of a sticker adds up to the message's text. 182 | 183 | Args: 184 | msg (telethon.tl.types.Message): A message. 185 | target_id (int): Target's dialogue id. 186 | your_name (str): Your name. 187 | target_name (str): Target's name. 188 | 189 | Returns: 190 | MyMessage obj. 
191 | """ 192 | return MyMessage(msg.message + (msg.sticker.attributes[1].alt if msg.sticker is not None else ''), 193 | msg.date.replace(tzinfo=None) + relativedelta(hours=time_offset(msg.date)), 194 | target_name if msg.from_id == target_id else your_name, 195 | is_forwarded=msg.forward is not None, 196 | document_id=msg.document.id if msg.document is not None else None, 197 | has_sticker=msg.sticker is not None, 198 | has_video=msg.video is not None, 199 | has_voice=(msg.voice is not None and 200 | msg.document.mime_type == "audio/ogg"), 201 | has_audio=(msg.audio is not None and 202 | msg.document.mime_type != "audio/ogg"), # let audio != voice 203 | has_photo=msg.photo is not None) 204 | -------------------------------------------------------------------------------- /message_analyser/analyser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import datetime 4 | import message_analyser.plotter as plt 5 | import message_analyser.storage as storage 6 | import message_analyser.retriever.vkOpt as vkOpt 7 | import message_analyser.structure_tools as stools 8 | import message_analyser.retriever.telegram as tlg 9 | from message_analyser.misc import log_line, delay 10 | 11 | 12 | async def save_scalar_info(msgs, your_name, target_name, dir_path): 13 | """Saves scalar information about messages into a file. Additionally prints all the info to console. 14 | 15 | Args: 16 | msgs (list of MyMessage objects): Messages. 17 | your_name (str): Your name. 18 | target_name (str): Target's name. 19 | dir_path (str): A path to the file to store info in. 
20 | """ 21 | with open(dir_path + "/scalar_info.csv", 'w', encoding="utf-8") as fp: 22 | day_messages = stools.get_messages_per_day(msgs) 23 | 24 | print_func = log_line 25 | 26 | fp.write(f"Start date:,{msgs[0].date}\n") 27 | print_func(f"{'Start date:'.ljust(25)}{msgs[0].date}") 28 | 29 | fp.write(f"Duration:,{str(msgs[-1].date - msgs[0].date).replace(',',' ')}\n") 30 | print_func(f"{'Duration:'.ljust(25)}{msgs[-1].date - msgs[0].date}") 31 | 32 | empty_days_num = len([day for day in day_messages if not day_messages[day]]) 33 | fp.write(f"Days without messages:,{empty_days_num},\n") 34 | print_func(f"{'Days without messages:'.ljust(25)}{empty_days_num}") 35 | 36 | most_active = max(day_messages, key=lambda day: len(day_messages[day])) 37 | fp.write(f"Most active day:,{most_active} : {len(day_messages[most_active])} messages\n") 38 | print_func(f"{'Most active day:'.ljust(25)}{most_active} : {len(day_messages[most_active])} messages") 39 | 40 | average = len(msgs) / len(day_messages) 41 | fp.write(f"Average messages per day:,{average:.2f} messages\n") 42 | print_func(f"{'Average messages per day:'.ljust(25)}{average:.2f} messages") 43 | 44 | max_delta, start_pause, end_pause = stools.get_longest_pause(msgs) 45 | fp.write(f"Longest pause:,{str(max_delta).replace(',',' ')} From {start_pause} to {end_pause}\n") 46 | print_func(f"{'Longest pause:'.ljust(25)}{max_delta} From {start_pause} to {end_pause}") 47 | 48 | fp.write(f"\nINFO,TOTAL,{your_name},{target_name}\n") 49 | print_func(f"{'INFO'.ljust(20)}{'TOTAL'.ljust(15)}{your_name:<15s}{target_name:<15s}") 50 | 51 | total_num = len(msgs) 52 | target_num = len([msg for msg in msgs if msg.author == target_name]) 53 | fp.write(f"All messages,{total_num},{total_num-target_num},{target_num}\n") 54 | print_func(f"{'All messages'.ljust(20)}{total_num:<15d}{total_num-target_num:<15d}{target_num:<15d}") 55 | 56 | msgs = stools.get_filtered(msgs, remove_forwards=True, remove_links=True, max_len=4095) 57 | 58 | total_chars = 
sum([len(msg.text) for msg in msgs if not msg.is_forwarded]) 59 | target_chars = sum([len(msg.text) for msg in msgs if not msg.is_forwarded and msg.author == target_name]) 60 | fp.write(f"Characters,{total_chars},{total_chars-target_chars},{target_chars}\n") 61 | print_func(f"{'Characters'.ljust(20)}{total_chars:<15d}{total_chars-target_chars:<15d}{target_chars:<15d}") 62 | 63 | total_photos = len([msg for msg in msgs if msg.has_photo]) 64 | target_photos = len([msg for msg in msgs if msg.has_photo and msg.author == target_name]) 65 | fp.write(f"Photos,{total_photos},{total_photos-target_photos},{target_photos}\n") 66 | print_func(f"{'Photos'.ljust(20)}{total_photos:<15d}{total_photos-target_photos:<15d}{target_photos:<15d}") 67 | 68 | total_stickers = len([msg for msg in msgs if msg.has_sticker]) 69 | target_stickers = len([msg for msg in msgs if msg.has_sticker and msg.author == target_name]) 70 | fp.write(f"Stickers,{total_stickers},{total_stickers-target_stickers},{target_stickers}\n") 71 | print_func((f"{'Stickers'.ljust(20)}{total_stickers:<15d}{total_stickers-target_stickers:<15d}" 72 | f"{target_stickers:<15d}")) 73 | 74 | total_songs = len([msg for msg in msgs if msg.has_audio]) 75 | target_songs = len([msg for msg in msgs if msg.has_audio and msg.author == target_name]) 76 | fp.write(f"Songs (audio files),{total_songs},{total_songs-target_songs},{target_songs}\n") 77 | print_func((f"{'Songs (audio files)'.ljust(20)}{total_songs:<15d}{total_songs-target_songs:<15d}" 78 | f"{target_songs:<15d}")) 79 | 80 | total_voice = len([msg for msg in msgs if msg.has_voice]) 81 | target_voice = len([msg for msg in msgs if msg.has_voice and msg.author == target_name]) 82 | fp.write(f"Voice messages,{total_voice},{total_voice-target_voice},{target_voice}\n") 83 | print_func(f"{'Voice messages'.ljust(20)}{total_voice:<15d}{total_voice-target_voice:<15d}{target_voice:<15d}") 84 | 85 | total_video = len([msg for msg in msgs if msg.has_video]) 86 | target_video = len([msg 
async def _plot_messages_distribution(msgs, your_name, target_name, results_directory):
    """Shows how messages are distributed.

    Runs every distribution plot from the plotter module once, in a fixed order, and logs a line
    when all of them are done.

    Args:
        msgs (list of MyMessage objects): Messages.
        your_name (str): Your name.
        target_name (str): Target's name.
        results_directory (str): Passed to every plot function as its last argument.
    """
    per_author = (msgs, your_name, target_name, results_directory)
    overall = (msgs, results_directory)
    # Each plot is listed exactly once (barplot_non_text_messages used to be invoked twice with
    # the same arguments).
    plot_calls = (
        (plt.heat_map, overall),
        (plt.pie_messages_per_author, per_author),
        (plt.stackplot_non_text_messages_percentage, overall),
        (plt.barplot_non_text_messages, overall),
        (plt.barplot_messages_per_weekday, per_author),
        (plt.barplot_messages_per_day, overall),
        (plt.barplot_messages_per_minutes, overall),
        (plt.distplot_messages_per_hour, overall),
        (plt.distplot_messages_per_month, overall),
        (plt.distplot_messages_per_day, overall),
        (plt.lineplot_messages, per_author),
    )
    for plot, args in plot_calls:
        plot(*args)
        # Hand control back to the event loop between plots
        # (presumably so the GUI can keep updating - confirm against GUI.start_gui).
        await asyncio.sleep(delay)
    log_line("Messages distribution was analysed.")
target_name, results_directory) 125 | await asyncio.sleep(delay) 126 | plt.barplot_emojis(msgs, your_name, target_name, 10, results_directory) 127 | await asyncio.sleep(delay) 128 | log_line("Content based messages distribution was analysed.") 129 | 130 | 131 | async def _plot_words_distribution(msgs, your_name, target_name, results_directory, words): 132 | """Shows how some words are distributed among the users.""" 133 | plt.barplot_words(msgs, your_name, target_name, words, 10, results_directory) 134 | await asyncio.sleep(delay) 135 | plt.wordcloud(msgs, words, results_directory) 136 | await asyncio.sleep(delay) 137 | log_line("Words distribution was analysed.") 138 | 139 | 140 | async def _plot_all(msgs, your_name, target_name, results_directory, words_file): 141 | await save_scalar_info(msgs, your_name, target_name, results_directory) 142 | await asyncio.sleep(delay) 143 | await _plot_messages_distribution(msgs, your_name, target_name, results_directory) 144 | await asyncio.sleep(delay) 145 | 146 | filtered_msgs = stools.get_filtered(msgs, remove_forwards=True, remove_empty=True, remove_links=True, max_len=4095) 147 | 148 | await _plot_messages_distribution_content_based(filtered_msgs, your_name, target_name, results_directory) 149 | await asyncio.sleep(delay) 150 | if words_file: 151 | words = storage.get_words(words_file) 152 | if words: 153 | await _plot_words_distribution(filtered_msgs, your_name, target_name, results_directory, words) 154 | await asyncio.sleep(delay) 155 | 156 | 157 | async def _get_all_messages(dialog, vkopt_file, your_name, target_name, loop): 158 | msgs = [] 159 | if dialog != -1: 160 | msgs.extend(await tlg.get_telegram_messages(your_name, target_name, loop=loop, target_id=dialog)) 161 | await asyncio.sleep(delay) 162 | if vkopt_file: 163 | msgs.extend(vkOpt.get_mymessages_from_file(your_name, target_name, vkopt_file)) 164 | await asyncio.sleep(delay) 165 | msgs.sort(key=lambda msg: msg.date) 166 | await asyncio.sleep(delay) 167 | 
return msgs 168 | 169 | 170 | def _save_words(msgs, your_name, target_name, path): 171 | total_words_cnt = stools.get_words_countered(msgs) 172 | top_words = [w for w, c in total_words_cnt.most_common(1000)] 173 | your_words_cnt = stools.get_words_countered([msg for msg in msgs if msg.author == your_name]) 174 | target_words_cnt = stools.get_words_countered([msg for msg in msgs if msg.author == target_name]) 175 | storage.store_top_words_count(top_words, your_words_cnt, target_words_cnt, path) 176 | 177 | 178 | async def _analyse(msgs, your_name, target_name, words_file, store_msgs=True, store_words=True): 179 | """Does analysis and stores results.""" 180 | log_line("Start messages analysis process.") 181 | 182 | if not len(msgs): 183 | log_line("No messages were received.") 184 | return 185 | date = datetime.datetime.today().strftime('%d-%m-%y %H-%M-%S') 186 | results_directory = os.path.join(os.path.split(os.path.normpath(os.path.dirname(__file__)))[0], "results", 187 | f"{date}_{your_name}_{target_name}") 188 | 189 | if not os.path.exists(results_directory): 190 | os.makedirs(results_directory) 191 | 192 | await asyncio.sleep(delay) 193 | 194 | if store_msgs: 195 | file_with_msgs = "messages.txt" 196 | storage.store_msgs(os.path.join(results_directory, file_with_msgs), msgs) 197 | if store_words: 198 | file_with_words = "words.txt" 199 | _save_words(msgs, your_name, target_name, os.path.join(results_directory, file_with_words)) 200 | 201 | await asyncio.sleep(delay) 202 | 203 | await _plot_all(msgs, your_name, target_name, results_directory, words_file) 204 | 205 | log_line("Done.") 206 | 207 | 208 | def analyse_from_file(path): 209 | """Analyses messages from a single file which was previously created by this program. 210 | 211 | Notes: 212 | Requires all the necessary configuration parameters (config.ini file) to be set either by GUI or manually. 
MAX_MSG_LEN = 4096


def get_filtered(msgs,
                 remove_empty=False,
                 remove_links=False,
                 remove_forwards=False,
                 except_patterns=None,
                 except_samples=None,
                 min_len=0,
                 max_len=MAX_MSG_LEN
                 ):
    """Filters a list of messages by different parameters.

    Notes:
        Patterns and samples are compared case-insensitively: they are lowered, and so is the
        text of the message they are compared to.

    Args:
        msgs (list of MyMessage objects): Messages to filter.
        remove_empty (bool): True skips messages with an empty text component.
        remove_links (bool): True skips messages which are links (msg.is_link).
        remove_forwards (bool): True skips messages which are forwarded (msg.is_forwarded).
        except_patterns (list of sets of strings (characters)):
            Skips messages whose set of (lowered) characters is equal to any set in this list.
            A plain string is accepted as well and is treated as the set of its characters.
        except_samples (list of strings):
            Skips messages whose (lowered) text is equal to any (lowered) string in this list.
        min_len (int): Skips messages shorter than min_len.
        max_len (int): Skips messages longer than max_len.

    Returns:
        A new list of MyMessage objects which passed all the filters (original order is kept).
    """
    pattern_sets = None
    if except_patterns is not None:
        # A pattern is a collection of characters; join it first so that both sets of single
        # characters and plain strings are lowered the same way the message text is.
        pattern_sets = {frozenset("".join(pattern).lower()) for pattern in except_patterns}
    samples = None
    if except_samples is not None:
        samples = {sample.lower() for sample in except_samples}

    def keep(msg):
        text = msg.text
        if remove_empty and text == "":
            return False
        if not min_len <= len(text) <= max_len:
            return False
        if remove_forwards and msg.is_forwarded:
            return False
        if remove_links and msg.is_link:
            return False
        if pattern_sets is None and samples is None:
            return True
        lowered = text.lower()
        if pattern_sets is not None and frozenset(lowered) in pattern_sets:
            return False
        if samples is not None and lowered in samples:
            return False
        return True

    return [msg for msg in msgs if keep(msg)]
def get_response_speed_per_timedelta(msgs, name, max_gap_seconds=4 * 3600):
    """Gets a list of response time lengths of a certain person.

    A response is the first message written by `name` right after a message written by somebody
    else; its response time is the time distance between these two consecutive messages.

    Notes:
        This function is not used anywhere (at the time when this docstring was written) because it needs
        a better algorithm for making decisions about a message being a response or not.
        The full time distance is used (timedelta.total_seconds()), so a reply sent more than a
        day later is never mistaken for a quick one.

    Args:
        msgs (list of MyMessage objects): Messages (expected to be ordered by date).
        name (str): The name of the person whose response time is calculated.
        max_gap_seconds (int/float): Longer gaps are not counted as responses
            (4 hours by default, because people sleep sometimes).

    Returns:
        A list of the person's (name) response time lengths in minutes (floats).
        It is empty when there are no messages.
    """
    res = []
    # Walk over consecutive pairs; zip() yields nothing for empty or single-message input.
    for previous, current in zip(msgs, msgs[1:]):
        if current.author != name or previous.author == name:
            continue  # not the first message of `name` after somebody else's message
        gap = (current.date - previous.date).total_seconds()
        if gap <= max_gap_seconds:
            res.append(gap / 60)
    return res
def get_months(msgs):
    """Gets months (first day for each month) between the first and the last messages in a list.

    Notes:
        ATTENTION: every returned date has its day set to 1 (first day of the month).

    Args:
        msgs (list of MyMessage objects): Messages. The first and the last element define the range.

    Returns:
        A list of datetime.date objects, one per calendar month, both border months included.
    """
    first_day = msgs[0].date.date()
    last_day = msgs[-1].date.date()
    # Number the months linearly (year * 12 + zero-based month) to walk them with a plain range.
    first_index = first_day.year * 12 + first_day.month - 1
    last_index = last_day.year * 12 + last_day.month - 1
    return [datetime.date(index // 12, index % 12 + 1, 1) for index in range(first_index, last_index + 1)]
180 | """ 181 | current_date = msgs[0].date.date() 182 | end_d = msgs[-1].date.date() 183 | res = [] 184 | if current_date.weekday() != 0: 185 | current_date -= relativedelta(days=current_date.weekday()) 186 | while current_date <= end_d: 187 | res.append(current_date) 188 | current_date += relativedelta(days=7) 189 | return res 190 | 191 | 192 | def str_day(day): 193 | """Transforms datetime day object into a "%d/%m/%y" string. 194 | 195 | Args: 196 | day (datetime/datetime.date): Day. 197 | 198 | Returns: 199 | A "%d/%m/%y" string representation. 200 | """ 201 | return day.strftime("%d/%m/%y") 202 | 203 | 204 | def date_days_to_str_days(days): 205 | """Transforms a list of datetime objects into a list of "%d/%m/%y" strings. 206 | 207 | Args: 208 | days (list of datetime objects): Days. 209 | 210 | Returns: 211 | A list of "%d/%m/%y" days representations. 212 | """ 213 | return [str_day(day) for day in days] 214 | 215 | 216 | def str_month(month): 217 | """Transforms datetime month object into a "%m/%y" string. 218 | 219 | Args: 220 | month (datetime/datetime.date): Month. 221 | 222 | Returns: 223 | A "%m/%y" string representation. 224 | """ 225 | return month.strftime("%m/%y") 226 | 227 | 228 | def date_months_to_str_months(months): 229 | """Transforms a list of datetime objects into a list of "%m/%y" strings. 230 | 231 | Args: 232 | months (list of datetime objects): Months. 233 | 234 | Returns: 235 | A list of "%m/%y" months representations. 236 | """ 237 | return [str_month(month) for month in months] 238 | 239 | 240 | def get_messages_per_month(msgs): 241 | """Gets lists of messages for each month between the first and last message. 242 | 243 | Notes: 244 | Months keys are set to the first day of the month. 245 | 246 | Args: 247 | msgs (list of Mymessage objects): Messages. 
def get_messages_per_minutes(msgs, minutes):
    """Gets lists of messages for each interval in minutes.

    Args:
        msgs (list of MyMessage objects): Messages.
        minutes (int): The number of minutes in one interval.

    Returns:
        A dictionary such as:
            {
                minute: list of all messages sent within interval [minute, minute + minutes).
            }
        where minute is the interval start counted from midnight; every interval of the day is
        present, even when its list is empty.
    """
    buckets = {}
    for interval_start in range(0, 24 * 60, minutes):
        buckets[interval_start] = []
    for msg in msgs:
        minute_of_day = msg.date.hour * 60 + msg.date.minute
        # Round down to the start of the interval this minute belongs to.
        buckets[minute_of_day - minute_of_day % minutes].append(msg)
    return buckets
def get_messages_per_day(msgs):
    """Gets lists of messages for each day between the first and the last message.

    Notes:
        Days are stored in a dictionary as datetime.date objects; days without messages are
        present as well and hold empty lists.

    Args:
        msgs (list of MyMessage objects): Messages (expected to be ordered by date: the first
            and the last element define the range of days).

    Returns:
        A dictionary such as:
            {
                day (datetime.date): list of messages sent this day
            }
        It is empty when there are no messages.
    """
    if not msgs:
        return {}
    first_day = msgs[0].date.date()
    last_day = msgs[-1].date.date()
    # Create a key for every calendar day of the range, so that silent days are kept too.
    res = {first_day + datetime.timedelta(days=offset): []
           for offset in range((last_day - first_day).days + 1)}
    for msg in msgs:
        res[msg.date.date()].append(msg)
    return res
def get_longest_pause(msgs):
    """Gets the longest time distance between two consecutive messages.

    Args:
        msgs (list of MyMessage objects): Messages. Must contain at least one message.

    Returns:
        A tuple such as:
            (timedelta of the longest pause in a dialogue, start datetime of the pause, end datetime of the pause).
        When no positive pause exists (a single message, or all messages share one timestamp) the
        pause is exactly datetime.timedelta(0) and both datetimes are the date of the first message.
        The earliest pause wins when several pauses are equally long.

    Raises:
        IndexError: If msgs is empty.
    """
    # An exact zero baseline: subtracting two separate "now" readings is not guaranteed to be zero.
    max_delta = datetime.timedelta(0)
    start_pause = end_pause = msgs[0].date
    for earlier, later in zip(msgs, msgs[1:]):
        delta = later.date - earlier.date
        if delta > max_delta:
            max_delta, start_pause, end_pause = delta, earlier.date, later.date
    return max_delta, start_pause, end_pause
def get_emoji_countered(msgs):
    """Counts all emojis in messages.

    Notes:
        Messages are scanned character by character, so only emojis which consist of
        a single character are counted.

    Args:
        msgs (list of MyMessage objects): Messages.

    Returns:
        collections.Counter of emojis.
    """
    is_emoji = getattr(emoji, "is_emoji", None)
    if is_emoji is None:
        # Releases of the emoji package without is_emoji() expose the UNICODE_EMOJI dictionary.
        known_emojis = emoji.UNICODE_EMOJI
        # Some of those releases nest it per language ({"en": {...}, ...});
        # a flat {emoji: name} dictionary has no "en" key and is used as is.
        known_emojis = known_emojis.get("en", known_emojis)
        is_emoji = known_emojis.__contains__
    return Counter(character for msg in msgs for character in msg.text if is_emoji(character))
class LoggingToGUI(logging.Handler):
    """Logging handler which appends every formatted record to a text widget."""

    # https://stackoverflow.com/a/18194597

    def __init__(self, console):
        """Remembers the widget (any tkinter text widget) the records are written to."""
        super().__init__()
        self.console = console

    def emit(self, message):
        """Formats the record and inserts it at the end of the widget, then scrolls to it."""
        text = self.format(message)
        widget = self.console

        # The widget is unlocked only for the time of the insertion, so no user can write in it.
        widget.configure(state=tk.NORMAL)
        widget.insert(tk.END, text)
        widget.configure(state=tk.DISABLED)
        widget.see(tk.END)
"""Represents a GUI for the message analyser app. 53 | 54 | Contains next frames: 55 | A frame with a greeting and choosing of the base analyser parameters (raise_start_frame). 56 | A frame to set analyser attributes based on previous frame results (raise_files_frame). 57 | A frame to make an initial sign-in into Telegram client (raise_telegram_auth_frame, optional). 58 | A frame to choose a Telegram dialogue to analyse messages from (raise_dialogs_select_frame, optional). 59 | A frame to show analysing process and results (raise_finish_frame). 60 | 61 | Attributes: 62 | parent (tk.Frame): A root frame (tk.Tk()) of a tkinter app. 63 | loop (asyncio.windows_events._WindowsSelectorEventLoop, optional): An event loop. 64 | x (int): A horizontal size of the window. 65 | y (int): A vertical size of the window. 66 | session_params (dict): 67 | A dictionary which contains all the message analyser parameters for their future processing. Looks like: 68 | { 69 | "from_vk": (bool) True if some messages will be received from the VkOpt file., 70 | "from_telegram": (bool) True if some messages will be received from the Telegram., 71 | "plot_words": (bool) True if we need a file with words for future analysis of them., 72 | "dialogue": (str,optional) String representation ("dialog_name (id=dialog_id)") of a Telegram dialogue., 73 | "vkopt_file": (str,optional) A path to the file with VkOpt messages., 74 | "words_file": (str,optional) A path to the file with words., 75 | "your_name": (str) Your name., 76 | "target_name": (str) Target's name. 77 | } 78 | """ 79 | 80 | def __init__(self, parent, loop, *args, **kwargs): 81 | """Inits MessageAnalyserGUI class with parent frame and basic attributes. 
Raises an initial frame of the GUI.""" 82 | tk.Frame.__init__(self, parent, *args, **kwargs) 83 | self.parent = parent 84 | self.parent.title("Message analyser") 85 | self.x, self.y = 700, 500 86 | self.parent.geometry(f"{self.x}x{self.y}") 87 | self.parent.grid_columnconfigure(3, weight=8) 88 | self.parent.resizable(False, False) 89 | self.default_font_name = "Courier" 90 | self.default_font = (self.default_font_name, 11) 91 | self.button_background = "#ccccff" 92 | self.aio_loop = loop 93 | 94 | self.session_params = dict() 95 | 96 | self.raise_start_frame() 97 | 98 | def __set_file_path(self, label_text, file): 99 | """Stores file path in session parameters and changes the corresponding label text.""" 100 | self.session_params[file] = filedialog.askopenfilename(title=file, filetypes=[("Text files", ".txt")]) 101 | label_text.set("File : " + os.path.split(self.session_params[file])[-1]) 102 | 103 | def raise_start_frame(self): 104 | """Chooses base analyser parameters (do or do not analyse Telegram messages/vk.com messages/words).""" 105 | labels_frame = tk.Frame() 106 | labels_frame.pack(side=tk.TOP) 107 | 108 | start_label = tk.Label(labels_frame, text="Hi!\nLet's get started", 109 | height=2, width=35, font=(self.default_font_name, 20)) 110 | start_label.pack() 111 | 112 | start_label = tk.Label(labels_frame, text="What do You want to analyse?", 113 | height=2, width=35, font=(self.default_font_name, 15)) 114 | start_label.pack() 115 | 116 | check_boxes_frame = tk.Frame() 117 | check_boxes_frame.pack(anchor=tk.W) 118 | from_telegram = tk.BooleanVar() 119 | telegram_check_button = tk.Checkbutton(check_boxes_frame, text="Messages from Telegram", variable=from_telegram, 120 | font=self.default_font) 121 | telegram_check_button.pack(anchor=tk.W) 122 | if "from_telegram" in self.session_params and self.session_params["from_telegram"]: 123 | telegram_check_button.select() 124 | 125 | from_vk = tk.BooleanVar() 126 | vk_check_button = tk.Checkbutton(check_boxes_frame, 
text="Messages from vkOpt text file", variable=from_vk, 127 | font=self.default_font) 128 | vk_check_button.pack(anchor=tk.W) 129 | if "from_vk" in self.session_params and self.session_params["from_vk"]: 130 | vk_check_button.select() 131 | 132 | plot_words = tk.BooleanVar() 133 | words_check_button = tk.Checkbutton(check_boxes_frame, text="Add file with words", variable=plot_words, 134 | font=self.default_font) 135 | words_check_button.pack(anchor=tk.W) 136 | if "plot_words" in self.session_params and self.session_params["plot_words"]: 137 | words_check_button.select() 138 | 139 | def set_data_and_continue(): 140 | if from_vk.get() or from_telegram.get(): 141 | self.session_params["plot_words"] = plot_words.get() 142 | self.session_params["from_vk"] = from_vk.get() 143 | self.session_params["from_telegram"] = from_telegram.get() 144 | bottom_frame.destroy() 145 | labels_frame.destroy() 146 | check_boxes_frame.destroy() 147 | return self.raise_files_frame() 148 | telegram_check_button.config(fg="red") 149 | vk_check_button.config(fg="red") 150 | 151 | bottom_frame = tk.Frame() 152 | bottom_frame.pack(side=tk.BOTTOM) 153 | continue_button = tk.Button(bottom_frame, text="Continue", command=set_data_and_continue, 154 | padx=35, background=self.button_background, font=self.default_font) 155 | continue_button.pack(side=tk.BOTTOM) 156 | self.parent.bind('', lambda _: set_data_and_continue()) 157 | 158 | def raise_files_frame(self): 159 | """Chooses a file with words and a file with VkOpt messages; assigns names.""" 160 | table_frame = tk.Frame() 161 | table_frame.pack(expand=True, fill="both") 162 | 163 | cur_row = 0 164 | if self.session_params["from_vk"]: 165 | cur_row += 1 166 | vkopt_label = tk.Label(table_frame, text="Choose path to:", height=2, font=self.default_font) 167 | vkopt_label.grid(row=cur_row, column=1, sticky=tk.W) 168 | 169 | vkopt_button = tk.Button(table_frame, text="vkOpt file", 170 | command=lambda: self.__set_file_path(vkopt_filename_label_text, 
"vkopt_file"), 171 | font=self.default_font) 172 | vkopt_button.grid(row=cur_row, column=2, sticky=tk.W) 173 | 174 | cur_row += 1 175 | vkopt_filename_label_text = tk.StringVar() 176 | vkopt_filename_label_text.set("File : ") 177 | vkopt_filename_label = tk.Label(table_frame, textvariable=vkopt_filename_label_text, height=2, 178 | font=self.default_font) 179 | vkopt_filename_label.grid(row=cur_row, column=1, sticky=tk.W, columnspan=30) 180 | 181 | if self.session_params["plot_words"]: 182 | cur_row += 1 183 | words_label = tk.Label(table_frame, text="Choose path to:", height=2, font=self.default_font) 184 | words_label.grid(row=cur_row, column=1, sticky=tk.W) 185 | 186 | words_button = tk.Button(table_frame, text="words file", 187 | command=lambda: self.__set_file_path(words_filename_label_text, "words_file"), 188 | font=self.default_font) 189 | words_button.grid(row=cur_row, column=2, sticky=tk.W) 190 | 191 | cur_row += 1 192 | words_filename_label_text = tk.StringVar() 193 | words_filename_label_text.set("File : ") 194 | words_filename_label = tk.Label(table_frame, textvariable=words_filename_label_text, height=2, 195 | font=self.default_font) 196 | words_filename_label.grid(row=cur_row, column=1, sticky=tk.W, columnspan=30) 197 | 198 | _, _, _, your_name, target_name = storage.get_session_params() 199 | 200 | cur_row += 1 201 | your_name_label = tk.Label(table_frame, text="Your name: ", height=2, font=self.default_font) 202 | your_name_label.grid(row=cur_row, column=1, sticky=tk.W) 203 | 204 | your_name_dir = tk.Entry(table_frame, width=40, font=self.default_font) 205 | your_name_dir.insert(tk.END, your_name) 206 | your_name_dir.grid(row=cur_row, column=2) 207 | 208 | cur_row += 1 209 | target_name_label = tk.Label(table_frame, text="Target's name: ", height=2, font=self.default_font) 210 | target_name_label.grid(row=cur_row, column=1, sticky=tk.W) 211 | 212 | target_name_dir = tk.Entry(table_frame, width=40, font=self.default_font) 213 | 
target_name_dir.insert(tk.END, target_name) 214 | target_name_dir.grid(row=cur_row, column=2) 215 | 216 | if self.session_params["from_vk"]: 217 | cur_row += 1 218 | names_label = tk.Label(table_frame, text=("Please be sure these names are equal to the names in the \n" 219 | "vkOpt file. Otherwise vkOpt file will not be read correctly."), 220 | fg="red", height=2, font=self.default_font, justify="left") 221 | names_label.grid(row=cur_row, column=1, sticky=tk.W, columnspan=30) 222 | 223 | def set_data_and_continue(): 224 | your_name_label.config(fg="black") 225 | target_name_label.config(fg="black") 226 | if your_name_dir.get().isspace() or not your_name_dir.get(): 227 | return your_name_label.config(fg="red") 228 | if target_name_dir.get().isspace() or not target_name_dir.get(): 229 | return target_name_label.config(fg="red") 230 | 231 | if self.session_params["from_vk"]: 232 | if "vkopt_file" not in self.session_params: 233 | return vkopt_filename_label.config(fg="red") 234 | vkopt_filename_label.config(fg="black") 235 | 236 | if self.session_params["plot_words"]: 237 | if "words_file" not in self.session_params: 238 | return words_filename_label.config(fg="red") 239 | words_filename_label.config(fg="black") 240 | 241 | self.session_params["your_name"] = your_name_dir.get() 242 | self.session_params["target_name"] = target_name_dir.get() 243 | bottom_frame.destroy() 244 | table_frame.destroy() 245 | if self.session_params["from_telegram"]: 246 | return self.raise_telegram_auth_frame() 247 | self.raise_finish_frame() 248 | 249 | def raise_start_frame(): 250 | bottom_frame.destroy() 251 | table_frame.destroy() 252 | self.raise_start_frame() 253 | 254 | bottom_frame = tk.Frame() 255 | bottom_frame.pack(side=tk.BOTTOM) 256 | back_button = tk.Button(bottom_frame, text="Back", command=raise_start_frame, 257 | padx=35, background=self.button_background, font=self.default_font) 258 | back_button.pack(side=tk.LEFT) 259 | 260 | continue_button = tk.Button(bottom_frame, 
text="Continue", command=set_data_and_continue, 261 | padx=35, background=self.button_background, font=self.default_font) 262 | continue_button.pack(side=tk.RIGHT) 263 | self.parent.bind('', lambda _: set_data_and_continue()) 264 | 265 | def raise_telegram_auth_frame(self): 266 | """Makes an initial sign-in into Telegram client.""" 267 | table_frame = tk.Frame() 268 | table_frame.pack(expand=True, fill="both") 269 | 270 | assert self.session_params["from_telegram"] 271 | 272 | api_id, api_hash, phone_number, _ = storage.get_telegram_secrets() 273 | 274 | # A text in labels should be 15 characters long in order to not shift entries. 275 | # Should make them more adaptive some day. 276 | api_id_label = tk.Label(table_frame, text="API id : ", height=2, font=self.default_font) 277 | api_id_label.grid(row=1, column=1, sticky=tk.W) 278 | 279 | api_id_dir = tk.Entry(table_frame, width=46, font=self.default_font) 280 | api_id_dir.insert(tk.END, api_id) 281 | api_id_dir.grid(row=1, column=2, sticky=tk.W) 282 | 283 | api_hash_label = tk.Label(table_frame, text="API hash : ", height=2, font=self.default_font) 284 | api_hash_label.grid(row=2, column=1, sticky=tk.W) 285 | 286 | api_hash_dir = tk.Entry(table_frame, width=46, font=self.default_font) 287 | api_hash_dir.insert(tk.END, api_hash) 288 | api_hash_dir.grid(row=2, column=2, sticky=tk.W) 289 | 290 | phone_number_label = tk.Label(table_frame, text="Phone number : ", height=2, font=self.default_font) 291 | phone_number_label.grid(row=3, column=1, sticky=tk.W) 292 | 293 | phone_number_dir = tk.Entry(table_frame, width=46, font=self.default_font) 294 | phone_number_dir.insert(tk.END, phone_number) 295 | phone_number_dir.grid(row=3, column=2, sticky=tk.W) 296 | 297 | code_label = tk.Label(table_frame, text="Code : ", height=2, font=self.default_font) 298 | code_label.grid(row=4, column=1, sticky=tk.W) 299 | 300 | code_dir = tk.Entry(table_frame, width=46, font=self.default_font) 301 | code_dir.grid(row=4, column=2, 
sticky=tk.W) 302 | 303 | password_label = tk.Label(table_frame, text="Password : ", height=2, font=self.default_font) 304 | password_label.grid(row=5, column=1, sticky=tk.W) 305 | 306 | password_dir = tk.Entry(table_frame, width=46, font=self.default_font) 307 | password_dir.grid(row=5, column=2, sticky=tk.W) 308 | 309 | message_label_text = tk.StringVar() 310 | 311 | message_label_text.set(("Please be sure You have set the right API ID and key\n" 312 | "They can be obtained from:\n" 313 | "https://core.telegram.org/api/obtaining_api_id")) 314 | message_label = tk.Label(table_frame, textvariable=message_label_text, height=3, 315 | font=self.default_font, fg="red", justify="left") 316 | message_label.grid(row=6, column=1, sticky=tk.W, columnspan=2) 317 | 318 | async def try_sign_in_and_continue(): 319 | res = await tlg.get_sign_in_results(api_id_dir.get(), 320 | api_hash_dir.get(), 321 | code_dir.get(), 322 | phone_number_dir.get(), 323 | password_dir.get(), 324 | self.session_params["your_name"], 325 | loop=self.aio_loop) 326 | try: 327 | api_id_label.config(fg="black") 328 | except tk.TclError: # too fast "continue" button clicks? 
329 | return 330 | api_hash_label.config(fg="black") 331 | phone_number_label.config(fg="black") 332 | code_label.config(fg="black") 333 | password_label.config(fg="black") 334 | if res == "wrong api": 335 | api_id_label.config(fg="red") 336 | api_hash_label.config(fg="red") 337 | return message_label_text.set("Please be sure You have set the right API ID and hash\n" 338 | "They can be obtained from:\n" 339 | "https://core.telegram.org/api/obtaining_api_id") 340 | elif res == "need phone": 341 | phone_number_label.config(fg="red") 342 | return message_label_text.set("Please carefully set Your phone number in order to \n" 343 | "get a confirmation code.\n ") 344 | 345 | elif res == "need code": 346 | code_label.config(fg="red") 347 | return message_label_text.set("Please check Your private messages (or SMS) and \n" 348 | "copypaste the right code.\n ") 349 | elif res == "need password": 350 | password_label.config(fg="red") 351 | return message_label_text.set("Please enter correct password.\n") 352 | elif res.startswith("need wait for "): 353 | return message_label_text.set(f'Please wait. 
    async def raise_dialogs_select_frame(self):
        """Chooses a Telegram dialogue to analyse messages from.

        Builds a frame with a drop-down list of the dialogues returned by tlg.get_str_dialogs
        and a "Continue" button. The button stores the selected string in
        self.session_params["dialogue"], destroys the frame and raises the finish frame.
        """
        table_frame = tk.Frame()
        table_frame.pack(expand=True, fill="both")

        dialog_select_label = tk.Label(table_frame, text="Please select a dialog You want to analyse messages from :",
                                       height=2, font=self.default_font)
        dialog_select_label.grid(row=1, column=1, sticky=tk.W)

        dialogs = await tlg.get_str_dialogs(loop=self.aio_loop)
        # Keep only the characters below U+FFFF in every dialogue string (the list is changed in place).
        # NOTE(review): presumably because Tk can not display characters above that range - confirm.
        for i in range(len(dialogs)):
            dialogs[i] = ''.join(char for char in dialogs[i] if char < u"\uffff")

        dialog_variable = tk.StringVar()
        # NOTE(review): dialogs[0] raises an IndexError when no dialogues were returned.
        dialog_variable.set(dialogs[0])  # default value
        dialog_selection_menu = tk.OptionMenu(table_frame, dialog_variable, *dialogs)
        dialog_selection_menu.grid(row=2, column=1, sticky=tk.W)

        def select_dialog_and_continue():
            # bottom_frame is created below; it exists by the time this callback can be run.
            self.session_params["dialogue"] = dialog_variable.get()
            bottom_frame.destroy()
            table_frame.destroy()
            self.raise_finish_frame()

        bottom_frame = tk.Frame()
        bottom_frame.pack(side=tk.BOTTOM)
        continue_button = tk.Button(bottom_frame, text="Continue",
                                    command=select_dialog_and_continue, padx=35, background=self.button_background,
                                    font=self.default_font)
        continue_button.pack(side=tk.BOTTOM)
        # NOTE(review): the event sequence is an empty string here; it looks like an angle-bracket
        # sequence (probably a keyboard shortcut for "Continue") was lost - confirm against the repository.
        self.parent.bind('', lambda _: select_dialog_and_continue())
message_analyser.misc import avg, log_line, months_border 17 | 18 | 19 | def _change_bar_width(ax, new_value): 20 | # https://stackoverflow.com/a/44542112 21 | for patch in ax.patches: 22 | current_width = patch.get_width() 23 | diff = current_width - new_value 24 | 25 | # we change the bar width 26 | patch.set_width(new_value) 27 | 28 | # we recenter the bar 29 | patch.set_x(patch.get_x() + diff * .5) 30 | 31 | 32 | def heat_map(msgs, path_to_save, seasons=False): 33 | sns.set(style="whitegrid") 34 | 35 | messages_per_day = stools.get_messages_per_day(msgs) 36 | months = stools.date_months_to_str_months(stools.get_months(msgs)) 37 | heat_calendar = {month: np.array([None] * 31, dtype=np.float64) for month in months} 38 | for day, d_msgs in messages_per_day.items(): 39 | heat_calendar[stools.str_month(day)][day.day - 1] = len(d_msgs) 40 | 41 | # min_day = len(min(messages_per_day.values(), key=len)) 42 | max_day = len(max(messages_per_day.values(), key=len)) 43 | 44 | data = np.array(list(heat_calendar.values())) 45 | mask = np.array([np.array(arr, dtype=bool) for arr in data]) 46 | 47 | cmap = cm.get_cmap("Purples") 48 | 49 | center = max_day * 0.4 # (avg([len(d) for d in messages_per_day.values()]) + (max_day - min_day) / 2) / 2 50 | 51 | ax = sns.heatmap(data=data, cmap=cmap, center=center, xticklabels=True, yticklabels=True, 52 | square=True, linewidths=.2, cbar_kws={"shrink": .5}) 53 | 54 | # builds a mask to highlight empty days 55 | sns.heatmap(data, mask=mask, 56 | xticklabels=range(1, 32), 57 | yticklabels=months, 58 | linewidths=.2, cbar=False, cmap=mpl_colors.ListedColormap(["#ffffe6"])) 59 | 60 | if seasons: # divides heatmap on seasons 61 | season_lines = [i for i, m in enumerate(months) if m.month % 3 == 0 and i != 0] 62 | ax.hlines(season_lines, *ax.get_xlim(), colors=["b"]) 63 | ax.set(xlabel="day", ylabel="month") 64 | ax.margins(x=0) 65 | 66 | plt.tight_layout() 67 | fig = plt.gcf() 68 | fig.set_size_inches(11, 8) 69 | 
def pie_messages_per_author(msgs, your_name, target_name, path_to_save):
    """Saves a pie chart with shares of your, target's and forwarded messages.

    Notes:
        Forwarded messages form their own wedge regardless of the author. Every other message
        which is not authored by your_name is counted as a target's message.

    Args:
        msgs (list of MyMessage objects): Messages.
        your_name (str): Your name.
        target_name (str): Target's name.
        path_to_save (str): A folder to save the .png file into.
    """
    own_msgs = [msg for msg in msgs if not msg.is_forwarded]
    forwarded = len(msgs) - len(own_msgs)
    your_messages_len = sum(1 for msg in own_msgs if msg.author == your_name)
    target_messages_len = len(own_msgs) - your_messages_len

    counted = [(your_name, your_messages_len), (target_name, target_messages_len), ("forwarded", forwarded)]
    data = [amount for _, amount in counted]
    labels = [f"{title}\n({amount})" for title, amount in counted]

    fig, ax = plt.subplots(figsize=(13, 8), subplot_kw=dict(aspect="equal"))

    # only the "forwarded" wedge is pulled out of the pie
    wedges, _, autotexts = ax.pie(x=data, explode=(.0, .0, .2), colors=["#4982BB", "#5C6093", "#53B8D7"],
                                  autopct=lambda pct: f"{pct:.1f}%",
                                  wedgeprops={"edgecolor": "black", "alpha": 0.8})
    ax.legend(wedges, labels, loc="upper right", bbox_to_anchor=(1, 0, 0.5, 1))
    plt.setp(autotexts, size=10, weight="bold")

    plot_name = pie_messages_per_author.__name__
    fig.savefig(os.path.join(path_to_save, plot_name + ".png"), dpi=500)
    plt.close("all")
    log_line(f"{plot_name} was created.")
def _get_plot_data(msgs):
    """Gets grouped data to plot.

    Notes:
        Messages are grouped by months when the history is longer than months_border months
        and by weeks otherwise.

    Args:
        msgs (list of MyMessage objects): Messages.

    Returns:
        x, y (tuple):
            x is a list of values for the x axis (days passed since the first message).
            y is a list of groups of messages (for y axis).
    """
    start_date = msgs[0].date.date()
    end_date = msgs[-1].date.date()
    if stools.count_months(msgs) > months_border:
        grouped = stools.get_messages_per_month(msgs)
    else:  # too short message history -> we split data by weeks, not months
        grouped = stools.get_messages_per_week(msgs)

    # A group may start before the first message, so negative offsets are clamped to zero.
    xticks = [max(0, (group_start - start_date).days) for group_start in grouped]
    y = list(grouped.values())

    # The first value is kept at the very first tick ...
    x = [xticks[0]]
    # ... inner values are placed halfway to the next tick ...
    x.extend((left + right) / 2 for left, right in zip(xticks[1:], xticks[2:]))
    # ... and the last one is placed between the last tick and the last day.
    if len(y) > 1:
        x.append((xticks[-1] + (end_date - start_date).days) / 2)

    return x, y
def barplot_non_text_messages(msgs, path_to_save):
    """Saves a stacked bar plot with amounts of non-text messages of each type per month (or week).

    Args:
        msgs (list of MyMessage objects): Messages.
        path_to_save (str): A folder to save the .png file into.
    """
    sns.set(style="whitegrid", palette="muted")

    colors = ['y', 'b', 'c', 'r', 'g', 'm']

    # x and xticks are unpacked but not used below: bars are placed by their labels.
    (x, y_total), (xticks, xticks_labels, xlabel) = _get_plot_data(msgs), _get_xticks(msgs, crop=False)

    # every item is a dictionary with a "type" (used as a legend label)
    # and "groups" (one number per group of y_total)
    bars = stools.get_non_text_messages_grouped(y_total)

    # bars are overlapping, so firstly we need to sum up the all...
    sum_bars = [0] * len(y_total)
    for bar in bars:
        sum_bars = list(map(operator.add, sum_bars, bar["groups"]))
    # ... plot and subtract one by one.
    # Each next (lower) bar is drawn over the previous one, so the visible part
    # of a bar is exactly the amount of its own type.
    for i, bar in enumerate(bars[:-1]):
        sns.barplot(x=xticks_labels, y=sum_bars, label=bar["type"], color=colors[i])
        sum_bars = list(map(operator.sub, sum_bars, bar["groups"]))
    # the last type gets the last color, whatever the number of types is
    ax = sns.barplot(x=xticks_labels, y=sum_bars, label=bars[-1]["type"], color=colors[-1])
    _change_bar_width(ax, 1.)

    # https://stackoverflow.com/a/4701285
    # Shrink current axis by 10%
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.9, box.height])
    # Put a legend to the right of the current axis
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    ax.set_xticklabels(xticks_labels, ha="right")
    ax.set(xlabel=xlabel, ylabel="messages")

    plt.xticks(rotation=65)
    fig = plt.gcf()
    fig.set_size_inches(16, 8)

    fig.savefig(os.path.join(path_to_save, barplot_non_text_messages.__name__ + ".png"), dpi=500)
    # plt.show()
    log_line(f"{barplot_non_text_messages.__name__} was created.")
    plt.close("all")
270 | ax.set(xlabel=xlabel, ylabel="messages") 271 | ax.set_xticklabels(xticks_labels) 272 | 273 | ax.tick_params(axis='x', bottom=True, color="#A9A9A9") 274 | plt.xticks(xticks, rotation=65) 275 | 276 | fig = plt.gcf() 277 | fig.set_size_inches(20, 10) 278 | fig.savefig(os.path.join(path_to_save, barplot_messages_per_day.__name__ + ".png"), dpi=500) 279 | 280 | # plt.show() 281 | log_line(f"{barplot_messages_per_day.__name__} was created.") 282 | plt.close("all") 283 | 284 | 285 | def barplot_messages_per_minutes(msgs, path_to_save, minutes=2): 286 | sns.set(style="whitegrid", palette="muted") 287 | sns.despine(top=True) 288 | 289 | messages_per_minutes = stools.get_messages_per_minutes(msgs, minutes) 290 | 291 | xticks_labels = stools.get_hours() 292 | xticks = [i * 60 // minutes for i in range(24)] 293 | 294 | min_minutes = len(min(messages_per_minutes.values(), key=lambda day: len(day))) 295 | max_minutes = len(max(messages_per_minutes.values(), key=lambda day: len(day))) 296 | pal = sns.color_palette("GnBu_d", max_minutes - min_minutes + 1)[::-1] 297 | 298 | ax = sns.barplot(x=list(range(len(messages_per_minutes))), y=[len(day) for day in messages_per_minutes.values()], 299 | edgecolor="none", 300 | palette=np.array(pal)[[len(day) - min_minutes for day in messages_per_minutes.values()]]) 301 | _change_bar_width(ax, 1.) 
302 | ax.set(xlabel="hour", ylabel="messages") 303 | ax.set_xticklabels(xticks_labels) 304 | 305 | ax.tick_params(axis='x', bottom=True, color="#A9A9A9") 306 | plt.xticks(xticks, rotation=65) 307 | 308 | fig = plt.gcf() 309 | fig.set_size_inches(20, 10) 310 | 311 | fig.savefig(os.path.join(path_to_save, barplot_messages_per_minutes.__name__ + ".png"), dpi=500) 312 | # plt.show() 313 | log_line(f"{barplot_messages_per_minutes.__name__} was created.") 314 | plt.close("all") 315 | 316 | 317 | def barplot_words(msgs, your_name, target_name, words, topn, path_to_save): 318 | sns.set(style="whitegrid") 319 | 320 | your_msgs = [msg for msg in msgs if msg.author == your_name] 321 | target_msgs = [msg for msg in msgs if msg.author == target_name] 322 | 323 | your_words_cnt = stools.get_words_countered(your_msgs) 324 | target_words_cnt = stools.get_words_countered(target_msgs) 325 | 326 | words.sort(key=lambda w: your_words_cnt[w] + target_words_cnt[w], reverse=True) 327 | df_dict = {"name": [], "word": [], "num": []} 328 | for word in words[:topn]: 329 | df_dict["word"].extend([word, word]) 330 | df_dict["name"].append(your_name) 331 | df_dict["num"].append(your_words_cnt[word]) 332 | df_dict["name"].append(target_name) 333 | df_dict["num"].append(target_words_cnt[word]) 334 | 335 | ax = sns.barplot(x="word", y="num", hue="name", data=pd.DataFrame(df_dict), palette="PuBu") 336 | ax.legend(ncol=1, loc="upper right", frameon=True) 337 | ax.set(ylabel="messages", xlabel='') 338 | 339 | fig = plt.gcf() 340 | fig.set_size_inches(14, 8) 341 | 342 | fig.savefig(os.path.join(path_to_save, barplot_words.__name__ + ".png"), dpi=500) 343 | # plt.show() 344 | log_line(f"{barplot_words.__name__} was created.") 345 | plt.close("all") 346 | 347 | 348 | def barplot_emojis(msgs, your_name, target_name, topn, path_to_save): 349 | sns.set(style="whitegrid") 350 | 351 | mc_emojis = stools.get_emoji_countered(msgs).most_common(topn) 352 | if not mc_emojis: 353 | return 354 | your_msgs = [msg 
for msg in msgs if msg.author == your_name] 355 | target_msgs = [msg for msg in msgs if msg.author == target_name] 356 | 357 | your_emojis_cnt = stools.get_emoji_countered(your_msgs) 358 | target_emojis_cnt = stools.get_emoji_countered(target_msgs) 359 | 360 | df_dict = {"name": [], "emoji": [], "num": []} 361 | for e, _ in mc_emojis: 362 | df_dict["emoji"].extend([emoji.demojize(e), emoji.demojize(e)]) 363 | df_dict["name"].append(your_name) 364 | df_dict["num"].append(your_emojis_cnt[e]) 365 | df_dict["name"].append(target_name) 366 | df_dict["num"].append(target_emojis_cnt[e]) 367 | 368 | ax = sns.barplot(x="num", y="emoji", hue="name", data=pd.DataFrame(df_dict), palette="PuBu") 369 | ax.set(ylabel="emoji name", xlabel="emojis") 370 | ax.legend(ncol=1, loc="lower right", frameon=True) 371 | 372 | fig = plt.gcf() 373 | fig.set_size_inches(11, 8) 374 | plt.tight_layout() 375 | 376 | fig.savefig(os.path.join(path_to_save, barplot_emojis.__name__ + ".png"), dpi=500) 377 | # plt.show() 378 | log_line(f"{barplot_emojis.__name__} was created.") 379 | plt.close("all") 380 | 381 | 382 | def barplot_messages_per_weekday(msgs, your_name, target_name, path_to_save): 383 | sns.set(style="whitegrid", palette="pastel") 384 | 385 | messages_per_weekday = stools.get_messages_per_weekday(msgs) 386 | labels = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"] 387 | 388 | ax = sns.barplot(x=labels, y=[len(weekday) for weekday in messages_per_weekday.values()], 389 | label=your_name, color="b") 390 | sns.set_color_codes("muted") 391 | sns.barplot(x=labels, 392 | y=[len([msg for msg in weekday if msg.author == target_name]) 393 | for weekday in messages_per_weekday.values()], 394 | label=target_name, color="b") 395 | 396 | ax.legend(ncol=2, loc="lower right", frameon=True) 397 | ax.set(ylabel="messages") 398 | sns.despine(right=True, top=True) 399 | 400 | fig = plt.gcf() 401 | fig.set_size_inches(11, 8) 402 | 403 | fig.savefig(os.path.join(path_to_save, 
barplot_messages_per_weekday.__name__ + ".png"), dpi=500) 404 | # plt.show() 405 | log_line(f"{barplot_messages_per_weekday.__name__} was created.") 406 | plt.close("all") 407 | 408 | 409 | def distplot_messages_per_hour(msgs, path_to_save): 410 | sns.set(style="whitegrid") 411 | 412 | ax = sns.distplot([msg.date.hour for msg in msgs], bins=range(25), color="m", kde=False) 413 | ax.set_xticklabels(stools.get_hours()) 414 | ax.set(xlabel="hour", ylabel="messages") 415 | ax.margins(x=0) 416 | 417 | plt.xticks(range(24), rotation=65) 418 | plt.tight_layout() 419 | fig = plt.gcf() 420 | fig.set_size_inches(11, 8) 421 | 422 | fig.savefig(os.path.join(path_to_save, distplot_messages_per_hour.__name__ + ".png"), dpi=500) 423 | # plt.show() 424 | log_line(f"{distplot_messages_per_hour.__name__} was created.") 425 | plt.close("all") 426 | 427 | 428 | def distplot_messages_per_day(msgs, path_to_save): 429 | sns.set(style="whitegrid") 430 | 431 | data = stools.get_messages_per_day(msgs) 432 | 433 | max_day_len = len(max(data.values(), key=len)) 434 | ax = sns.distplot([len(day) for day in data.values()], bins=list(range(0, max_day_len, 50)) + [max_day_len], 435 | color="m", kde=False) 436 | ax.set(xlabel="messages", ylabel="days") 437 | ax.margins(x=0) 438 | 439 | fig = plt.gcf() 440 | fig.set_size_inches(11, 8) 441 | 442 | fig.savefig(os.path.join(path_to_save, distplot_messages_per_day.__name__ + ".png"), dpi=500) 443 | # plt.show() 444 | log_line(f"{distplot_messages_per_day.__name__} was created.") 445 | plt.close("all") 446 | 447 | 448 | def distplot_messages_per_month(msgs, path_to_save): 449 | sns.set(style="whitegrid") 450 | 451 | start_date = msgs[0].date.date() 452 | (xticks, xticks_labels, xlabel) = _get_xticks(msgs) 453 | 454 | ax = sns.distplot([(msg.date.date() - start_date).days for msg in msgs], 455 | bins=xticks + [(msgs[-1].date.date() - start_date).days], color="m", kde=False) 456 | ax.set_xticklabels(xticks_labels) 457 | ax.set(xlabel=xlabel, 
ylabel="messages") 458 | ax.margins(x=0) 459 | 460 | plt.xticks(xticks, rotation=65) 461 | plt.tight_layout() 462 | fig = plt.gcf() 463 | fig.set_size_inches(11, 8) 464 | 465 | fig.savefig(os.path.join(path_to_save, distplot_messages_per_month.__name__ + ".png"), dpi=500) 466 | # plt.show() 467 | log_line(f"{distplot_messages_per_month.__name__} was created.") 468 | plt.close("all") 469 | 470 | 471 | def lineplot_message_length(msgs, your_name, target_name, path_to_save): 472 | sns.set(style="whitegrid") 473 | 474 | (x, y_total), (xticks, xticks_labels, xlabel) = _get_plot_data(msgs), _get_xticks(msgs) 475 | 476 | y_your = [avg([len(msg.text) for msg in period if msg.author == your_name]) for period in y_total] 477 | y_target = [avg([len(msg.text) for msg in period if msg.author == target_name]) for period in y_total] 478 | 479 | plt.fill_between(x, y_your, alpha=0.3) 480 | ax = sns.lineplot(x=x, y=y_your, palette="denim blue", linewidth=2.5, label=your_name) 481 | plt.fill_between(x, y_target, alpha=0.3) 482 | sns.lineplot(x=x, y=y_target, linewidth=2.5, label=target_name) 483 | 484 | ax.set(xlabel=xlabel, ylabel="average message length (characters)") 485 | ax.set_xticklabels(xticks_labels) 486 | 487 | ax.tick_params(axis='x', bottom=True, color="#A9A9A9") 488 | plt.xticks(xticks, rotation=65) 489 | ax.margins(x=0, y=0) 490 | 491 | # plt.tight_layout() 492 | fig = plt.gcf() 493 | fig.set_size_inches(13, 7) 494 | 495 | fig.savefig(os.path.join(path_to_save, lineplot_message_length.__name__ + ".png"), dpi=500) 496 | # plt.show() 497 | plt.close("all") 498 | log_line(f"{lineplot_message_length.__name__} was created.") 499 | 500 | 501 | def lineplot_messages(msgs, your_name, target_name, path_to_save): 502 | sns.set(style="whitegrid") 503 | 504 | (x, y_total), (xticks, xticks_labels, xlabel) = _get_plot_data(msgs), _get_xticks(msgs) 505 | 506 | y_your = [len([msg for msg in period if msg.author == your_name]) for period in y_total] 507 | y_target = [len([msg for msg 
in period if msg.author == target_name]) for period in y_total] 508 | 509 | plt.fill_between(x, y_your, alpha=0.3) 510 | ax = sns.lineplot(x=x, y=y_your, palette="denim blue", linewidth=2.5, label=your_name) 511 | plt.fill_between(x, y_target, alpha=0.3) 512 | sns.lineplot(x=x, y=y_target, linewidth=2.5, label=target_name) 513 | 514 | ax.set(xlabel=xlabel, ylabel="messages") 515 | ax.set_xticklabels(xticks_labels) 516 | 517 | ax.tick_params(axis='x', bottom=True, color="#A9A9A9") 518 | plt.xticks(xticks, rotation=65) 519 | ax.margins(x=0, y=0) 520 | 521 | # plt.tight_layout() 522 | fig = plt.gcf() 523 | fig.set_size_inches(13, 7) 524 | 525 | fig.savefig(os.path.join(path_to_save, lineplot_messages.__name__ + ".png"), dpi=500) 526 | # plt.show() 527 | plt.close("all") 528 | log_line(f"{lineplot_messages.__name__} was created.") 529 | 530 | 531 | def wordcloud(msgs, words, path_to_save): 532 | all_words_list = [] 533 | words_cnt = stools.get_words_countered(msgs) 534 | # we need to create a huge string which contains each word as many times as it encounters in messages. 535 | for word in set(words): 536 | all_words_list.extend([word] * (words_cnt[word])) 537 | random.shuffle(all_words_list, random.random) # don't forget to shuffle ! 538 | 539 | if not all_words_list: 540 | log_line("No such words were found in message history.") 541 | return 542 | 543 | all_words_string = ' '.join(all_words_list) 544 | 545 | # the cloud will be a circle. 
546 | radius = 500 547 | x, y = np.ogrid[:2 * radius, :2 * radius] 548 | mask = (x - radius) ** 2 + (y - radius) ** 2 > radius ** 2 549 | mask = 255 * mask.astype(int) 550 | 551 | word_cloud = wc.WordCloud(background_color="white", repeat=False, mask=mask) 552 | word_cloud.generate(all_words_string) 553 | 554 | plt.axis("off") 555 | plt.imshow(word_cloud, interpolation="bilinear") 556 | 557 | word_cloud.to_file(os.path.join(path_to_save, wordcloud.__name__ + ".png")) 558 | # plt.show() 559 | plt.close() 560 | log_line(f"{wordcloud.__name__} was created.") 561 | --------------------------------------------------------------------------------