├── .gitignore ├── requirements.txt ├── Dockerfile ├── example_config.json ├── LICENSE ├── summarization.py ├── prompts └── example_summarization_prompt.txt ├── README.md ├── app.py └── communication.py /.gitignore: -------------------------------------------------------------------------------- 1 | config.json 2 | prompts/ 3 | __pycache__/ 4 | .idea/ 5 | *.session 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Telethon==1.38.1 2 | langchain==0.3.14 3 | langchain-openai==0.2.14 4 | pydantic==2.10.4 5 | schedule==1.2.2 6 | pyTelegramBotAPI==4.26.0 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11 2 | 3 | COPY requirements.txt /app/ 4 | COPY app.py /app/ 5 | COPY summarization.py /app/ 6 | COPY communication.py /app/ 7 | COPY prompts/ /app/prompts/ 8 | COPY config.json /app/ 9 | 10 | RUN python3 -m pip install -r /app/requirements.txt 11 | 12 | WORKDIR /app 13 | 14 | CMD ["python3", "/app/app.py", "/app/config.json"] 15 | -------------------------------------------------------------------------------- /example_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "telegram_api_id": , 3 | "telegram_api_hash": "", 4 | "openai_api_key": "", 5 | "telegram_bot_auth_token": "", 6 | "chats_to_summarize": [ 7 | { 8 | "id": "", 9 | "lookback_period_seconds": 86400, 10 | "summarization_prompt_path": "prompts/example_summarization_prompt.txt" 11 | } 12 | ], 13 | "telegram_summary_receivers": [ 14 | "" 15 | ] 16 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Georgiy Manuilov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /summarization.py: -------------------------------------------------------------------------------- 1 | from langchain.prompts import ( 2 | ChatPromptTemplate, 3 | HumanMessagePromptTemplate, 4 | MessagesPlaceholder, 5 | ) 6 | from langchain_core.messages import SystemMessage 7 | from langchain_openai import ChatOpenAI 8 | from langchain.chains import LLMChain 9 | from langchain.memory import ConversationBufferMemory 10 | 11 | 12 | class Summarizer: 13 | def __init__(self, openai_api_key): 14 | self.openai_api_key = openai_api_key 15 | self.openai_model = "gpt-4-turbo-preview" 16 | 17 | # Needed to store chat history 18 | self.persistent_prompt = ChatPromptTemplate.from_messages( 19 | [ 20 | SystemMessage( 21 | content="You are a chatbot having a conversation with a human."), 22 | MessagesPlaceholder(variable_name="chat_history"), 23 | HumanMessagePromptTemplate.from_template("{human_input}") 24 | ] 25 | ) 26 | 27 | def summarize(self, text_to_summarize, summarization_prompt): 28 | memory = ConversationBufferMemory( 29 | memory_key="chat_history", return_messages=True) 30 | llm = ChatOpenAI(model_name=self.openai_model, openai_api_key=self.openai_api_key) 31 | chat_llm_chain = LLMChain( 32 | llm=llm, 33 | prompt=self.persistent_prompt, 34 | verbose=False, 35 | memory=memory, 36 | ) 37 | init_prompt = summarization_prompt.format(text_to_summarize=text_to_summarize) 38 | return chat_llm_chain.predict(human_input=init_prompt), chat_llm_chain 39 | 40 | @staticmethod 41 | def validate_summarization_prompt(summarization_prompt): 42 | if not "{text_to_summarize}" in summarization_prompt: 43 | raise RuntimeError("Summarization prompt should include \"{ text_to_summarize }\"") 44 | -------------------------------------------------------------------------------- /prompts/example_summarization_prompt.txt: -------------------------------------------------------------------------------- 1 | The text document below is a message history from a Telegram group chat. 2 | I need you to summarize this chat history and yield 5 primary conversation topics. 3 | Each conversation topic mentioned should be accompanied by a one-sentence summaries of 2-3 most representative dialogs (not single messages) from the conversation on the given topic including user names. For each dialog summary provide the exact keywords with which the message can be found in the history using text search. 4 | IMPORTANT: The output should be provided in the language which prevails in the messages text. 5 | 6 | Here's an example of desired output in Russian language (follow the exact structure): 7 | 8 | 1. Debugging late at night: Programmers share their experiences and challenges while fixing bugs in the wee hours. 9 | Example messages: 10 | - Mike jokes about how his code only works after midnight, suggesting a magical time for debugging. Keywords: "code", "magical debugging hour". 11 | - Sarah laments the endless cycle of finding one bug only to encounter another, echoing the never-ending nature of programming. Keywords: "endless cycle", "bug after bug". 12 | 13 | 2. The eternal debate between spaces and tabs for indentation. 14 | Example messages: 15 | - John passionately argues for spaces, citing consistency across different IDEs. Keywords: "spaces", "consistency", "IDEs". 16 | - Emily defends tabs, emphasizing customization and less file size. Keywords: "tabs", "customization", "file size". 17 | 18 | 3. Coffee as a programmer's best friend: Discussions revolve around the importance of coffee in coding sessions. 19 | Example messages: 20 | - Kevin shares his routine of coding with a freshly brewed cup, claiming it boosts his productivity. Keywords: "coffee", "coding", "productivity". 21 | - Lisa mentions the dire consequences of running out of coffee during a project, half-jokingly suggesting it’s worse than a major bug. Keywords: "out of coffee", "worse than bug". 22 | 23 | 4. The challenge of explaining programming to non-tech family members. 24 | Example messages: 25 | - Alex recounts a humorous attempt to explain his job as a developer to his grandparents, comparing it to magic. Keywords: "explaining job", "grandparents", "magic". 26 | - Rachel describes the moment her parents proudly told their friends she "fixes computers", simplifying her software development role. Keywords: "fixes computers", "parents", "software development". 27 | 28 | 5. The quest for the perfect IDE: Programmers debate the merits of various Integrated Development Environments. 29 | Example messages: 30 | - Tom advocates for VS Code, highlighting its extensions and community support. Keywords: "VS Code", "extensions", "community". 31 | - Naomi prefers JetBrains IDEs for their out-of-the-box experience and powerful refactoring tools. Keywords: "JetBrains", "out-of-the-box", "refactoring". 32 | 33 | Here's the JSON document: 34 | {text_to_summarize} 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Telegram Chat Summarizer App 2 | 3 | **Are you sick of skimming through tons of Telegram messages daily looking for the valuable info? The salvation is here!** 4 | 5 | This repository hosts an implementation of a Telegram application which monitors and summarizes group chats. Initially 6 | created for personal usage, it's intended for people who need to gather information from one or several live massive 7 | Telegram group chats which generate way too many messages to be reviewed manually. 8 | 9 | Based on the given configurations it: 10 | 11 | 1. Monitors a set of given Telegram group chats using [Telegram API](https://core.telegram.org/#telegram-api). 12 | 2. Summarizes the monitored chats over the defined lookback period (i.e. if you set the lookback period to 12 hours, 13 | you'll be receiving summaries for the last 12 hours twice a day). **Only text message are analyzed as of now.** 14 | 3. Sends the summaries to a given set of Telegram users using the [Bot API](https://core.telegram.org/#bot-api). 15 | 4. For each sent summary the app preserves the summarization context until the next summarization so that the user can 16 | ask clarifying questions on the summary. 17 | 18 | ## Installation 19 | 20 | 1. Obtain `api_id` and `api_hash` values for the Telegram API 21 | using [this](https://core.telegram.org/api/obtaining_api_id#obtaining-api-id) guide. 22 | 2. Create a Telegram bot and obtain its token 23 | using [this](https://core.telegram.org/bots/tutorial#obtain-your-bot-token) guide. 24 | 3. Obtain the OpenAI API key from [here](https://platform.openai.com/api-keys). For now the app has OpenAI backend 25 | hard-coded (`gpt-4-turbo-preview` model), but it's pretty easy to replace it with the backend of your choice as it's 26 | used via the [LangChain library](https://github.com/langchain-ai/langchain) calls. 27 | 4. Write a prompt for the chat summarization. Use the one in the `examples/` folder as a reference. 28 | 5. Define the configuration and save it to the `config.json` file: 29 | 30 | ```json 31 | { 32 | "telegram_api_id": , 33 | "telegram_api_hash": "", 34 | "openai_api_key": "", 35 | "telegram_bot_auth_token": "", 36 | "chats_to_summarize": [ 37 | { 38 | "id": "", 39 | "lookback_period_seconds": 86400, 40 | "summarization_prompt_path": "prompts/example_summarization_prompt.txt" 41 | } 42 | ], 43 | "telegram_summary_receivers": [ 44 | "" 45 | ] 46 | } 47 | ``` 48 | 49 | 5. Install Python requirements defined in the requirements.txt file or build the Docker image: 50 | 51 | ```shell 52 | python3 -m pip install -r requirements.txt 53 | ``` 54 | 55 | or 56 | 57 | ```shell 58 | docker build -t tcsa:latest . 59 | ``` 60 | 61 | 6. Run the app: 62 | 63 | ```shell 64 | python3 app.py config.json 65 | ``` 66 | 67 | or 68 | 69 | ```shell 70 | docker run -it tcsa:latest 71 | ``` 72 | 73 | At the first run the app will ask you to log in to the used Telegram account, like this: 74 | 75 | ```shell 76 | user@pc:~/telegram-chat-summarizer $ python3 app.py config.json 77 | 2024-03-27 23:03:11,618 - INFO - Started! 78 | Please enter your phone (or bot token): 79 | Please enter the code you received: 80 | Please enter your password: 81 | ``` 82 | 83 | Then the session will be stored on the disk, and the subsequent runs won't require authentication. 84 | 85 | ## Usage 86 | 87 | Once the app is up and running, each summary subscriber needs to send the `/verify` message to the bot so that it can 88 | register the user. 89 | 90 | The bot can switch conversation context by being provided with the command `/` (the chat name can 91 | be any of the ones defined in the config). This mechanism is used if you have more than one chat being summarized: by 92 | giving the corresponding command you can switch the LLM context to a different chat and discuss that chat's summary. 93 | 94 | ## Implementation details and limitations 95 | 96 | The implementation is very simplistic, and there is definitely a room for improvement. Some immediate nice-to-haves (PRs 97 | are 98 | welcome!): 99 | 100 | 1. Disk storage persistence for user verification and chat contexts. 101 | 2. Getting rid of the summary subscription mechanism in favor of something less inconvenient. 102 | 3. Basic app management (chats to summarize, summary subscribers etc.) through the bot as an alternative to the config 103 | file. 104 | 4. Supporting messages other than text ones: voice messages, images, videos. 105 | 106 | There is a [step-by-step guide on Habr (RU)](https://habr.com/ru/articles/804111/) written after this implementation. 107 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import threading 3 | from typing import List, Union 4 | from collections import defaultdict 5 | import logging 6 | import schedule 7 | import time 8 | import json 9 | from pydantic import BaseModel, Field 10 | 11 | from communication import GroupChatScrapper, EnvoyBot 12 | from summarization import Summarizer 13 | 14 | 15 | class SummarizationConfig(BaseModel): 16 | id: Union[str, int] 17 | lookback_period_seconds: int 18 | summarization_prompt_path: str 19 | 20 | 21 | class AppConfig(BaseModel): 22 | log_level: str = Field(default="INFO") 23 | telegram_api_id: int 24 | telegram_api_hash: str 25 | telegram_bot_auth_token: str 26 | openai_api_key: str 27 | chats_to_summarize: List[SummarizationConfig] 28 | telegram_summary_receivers: List[str] 29 | 30 | 31 | if __name__ == "__main__": 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("path_to_config") 34 | args = parser.parse_args() 35 | 36 | with open(args.path_to_config, "r") as f: 37 | app_config = AppConfig.model_validate_json(f.read()) 38 | 39 | # Validate user prompts 40 | for c in app_config.chats_to_summarize: 41 | with open(c.summarization_prompt_path, "r") as f: 42 | Summarizer.validate_summarization_prompt(f.read()) 43 | 44 | # Initialize logger 45 | logger = logging.getLogger("CSB") 46 | handler = logging.StreamHandler() 47 | handler.setFormatter(logging.Formatter( 48 | '%(asctime)s - %(levelname)s - %(message)s')) 49 | logger.addHandler(handler) 50 | logger.setLevel(app_config.log_level) 51 | logger.info("Started!") 52 | 53 | # Declare global LLM context storage 54 | llm_contexts = defaultdict(dict) 55 | llm_contexts_lock = threading.Lock() 56 | 57 | 58 | def chat_callback(input_message_text, sender, context_name, send_message_func): 59 | with llm_contexts_lock: 60 | envoy_bot.set_typing_status([sender], llm_contexts_lock.locked) 61 | if not context_name in llm_contexts or not sender in llm_contexts[context_name]: 62 | send_message_func(f"No context is available for {context_name} yet") 63 | return 64 | logger.info(f"Chatting with: {sender}") 65 | response = llm_contexts[context_name][sender].predict(human_input=input_message_text) 66 | logger.debug(f"Response to message \"{input_message_text}\" from {sender}: \"{response}\"") 67 | send_message_func(response) 68 | 69 | 70 | summarizer = Summarizer(app_config.openai_api_key) 71 | group_chat_scrapper = GroupChatScrapper(app_config.telegram_api_id, app_config.telegram_api_hash) 72 | envoy_bot = EnvoyBot( 73 | app_config.telegram_bot_auth_token, 74 | app_config.telegram_summary_receivers, 75 | [c.id for c in app_config.chats_to_summarize], 76 | chat_callback 77 | ) 78 | 79 | 80 | def summarization_job(chat_cfg, summarization_prompt, summary_receivers): 81 | logger.info(f"Running summarization job for: {chat_cfg.id}") 82 | with llm_contexts_lock: 83 | # Set the "typing" status for the bot 84 | envoy_bot.set_typing_status(summary_receivers, llm_contexts_lock.locked) 85 | 86 | # Scrap messages for the given chat 87 | messages, chat_title= group_chat_scrapper.get_message_history(chat_cfg.id, chat_cfg.lookback_period_seconds) 88 | logger.debug( 89 | f"Scrapped {len(messages)} messages for {chat_cfg.id} over the last {chat_cfg.lookback_period_seconds} seconds") 90 | serialized_messages = json.dumps({"messages": messages}, ensure_ascii=False) 91 | 92 | # Summarize messages 93 | summary, context = summarizer.summarize(serialized_messages, summarization_prompt) 94 | 95 | # Send the summary and update LLM context 96 | for u in summary_receivers: 97 | llm_contexts[chat_cfg.id][u] = context 98 | logger.info(f"Sending summary for {chat_cfg.id} to {u}") 99 | logger.debug(f"Summary for {chat_title}: {summary}") 100 | chat_lookback_period_hours = int(chat_cfg.lookback_period_seconds / 60 / 60) 101 | envoy_bot.send_summary( 102 | u, 103 | f"Summary for {chat_cfg.id} for the last {chat_lookback_period_hours} hours:\n\n{summary}", 104 | chat_cfg.id 105 | ) 106 | 107 | 108 | # Setup recurring summarization jobs 109 | for chat_config in app_config.chats_to_summarize: 110 | with open(chat_config.summarization_prompt_path, "r") as f: 111 | chat_summarization_prompt = f.read() 112 | schedule.every(chat_config.lookback_period_seconds).seconds.do( 113 | job_func=summarization_job, 114 | chat_cfg=chat_config, 115 | summarization_prompt=chat_summarization_prompt, 116 | summary_receivers=app_config.telegram_summary_receivers 117 | ) 118 | 119 | # Run the jobs for the first time 120 | schedule.run_all() 121 | while True: 122 | schedule.run_pending() 123 | time.sleep(1) 124 | -------------------------------------------------------------------------------- /communication.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | from datetime import datetime, timedelta, timezone 3 | import threading 4 | import time 5 | import logging 6 | from telethon.sync import TelegramClient 7 | from telethon.tl.types import User, Channel 8 | import telebot 9 | 10 | 11 | class GroupChatScrapper: 12 | def __init__(self, telegram_api_id, telegram_api_hash): 13 | self.logger = logging.getLogger("CSB") 14 | # Here we are forced to use the Telegram API because bots cannot be added to group chats by anyone except admins 15 | self.client = TelegramClient("CSB", api_id=telegram_api_id, api_hash=telegram_api_hash) 16 | self.client.start() 17 | # We need to always disconnect not to break the Telegram session 18 | atexit.register(self.client.disconnect) 19 | 20 | @staticmethod 21 | def get_telegram_user_name(sender): 22 | if type(sender) is User: 23 | if sender.first_name and sender.last_name: 24 | return sender.first_name + " " + sender.last_name 25 | elif sender.first_name: 26 | return sender.first_name 27 | elif sender.last_name: 28 | return sender.last_name 29 | else: 30 | return "" 31 | else: 32 | if type(sender) is Channel: 33 | return sender.title 34 | 35 | @staticmethod 36 | def get_datetime_from(lookback_period): 37 | return (datetime.utcnow() - timedelta(seconds=lookback_period)).replace(tzinfo=timezone.utc) 38 | 39 | def get_message_history(self, chat_id, lookback_period): 40 | history = [] 41 | datetime_from = self.get_datetime_from(lookback_period) 42 | # Warning: this probably won't work with the private group chats as those require joining beforehand 43 | # (public chats can be scrapped right away) 44 | for message in self.client.iter_messages(chat_id): 45 | if message.date < datetime_from: 46 | break 47 | if not message.text: 48 | logging.warning(f"Non-text message skipped, summarization result might be affected") 49 | continue 50 | sender = message.get_sender() 51 | data = { 52 | "id": message.id, 53 | "datetime": str(message.date), 54 | "text": message.text, 55 | "sender_user_name": self.get_telegram_user_name(sender), 56 | "sender_user_id": sender.id, 57 | "is_reply": message.is_reply 58 | } 59 | if message.is_reply: 60 | data["reply_to_message_id"] = message.reply_to.reply_to_msg_id 61 | history.append(data) 62 | chat_title = self.client.get_entity(chat_id).title 63 | return list(reversed(history)), chat_title 64 | 65 | 66 | class EnvoyBot: 67 | def __init__(self, telegram_bot_auth_token, telegram_summary_receivers, allowed_contexts, chat_callback): 68 | self.logger = logging.getLogger("CSB") 69 | self.telegram_summary_receivers = telegram_summary_receivers 70 | self.verified_receivers = dict() 71 | 72 | # This one is used for switching between summarized chat conversation 73 | self.allowed_commands = ["/" + c for c in allowed_contexts] 74 | self.current_user_contexts = dict() 75 | 76 | # This one is used to generate responses for arbitrary messages 77 | self.chat_callback = chat_callback 78 | 79 | # The bot is running in the background thread to make the call non-blocking 80 | self.bot = telebot.TeleBot(telegram_bot_auth_token) 81 | self.bot.set_update_listener(self.__handle_messages) 82 | self.bot_thread = threading.Thread(target=self.bot.infinity_polling) 83 | self.bot_thread.start() 84 | 85 | def send_summary(self, username, text, chat_id): 86 | if not username in self.verified_receivers: 87 | self.logger.info(f"User {username} is not yet verified") 88 | return 89 | self.bot.send_message(self.verified_receivers[username], text, parse_mode="HTML") 90 | self.set_current_user_context(username, chat_id) 91 | 92 | def set_typing_status(self, users, predicate): 93 | # The self self.bot.send_chat_action(user, "typing") sets the status for <= 5 seconds until the message is sent 94 | # We use this kludge to make the status persistent for a longer time 95 | def f(): 96 | while predicate(): 97 | for u in users: 98 | if u in self.verified_receivers: 99 | self.bot.send_chat_action(self.verified_receivers[u], "typing") 100 | time.sleep(5) 101 | 102 | threading.Thread(target=f).start() 103 | 104 | def set_current_user_context(self, username, context): 105 | self.current_user_contexts[username] = context 106 | 107 | def __handle_messages(self, messages): 108 | for message in messages: 109 | if not message.text: 110 | return 111 | sender = message.from_user.username 112 | if not sender or not sender in self.telegram_summary_receivers: 113 | self.logger.warning(f"Unauthorized usage attempt from user: {str(message.from_user)}") 114 | return 115 | if message.text.startswith("/"): 116 | if message.text == "/verify": 117 | # We need this verification because bots cannot retrieve chat IDs by the username 118 | self.verified_receivers[sender] = message.chat.id 119 | self.bot.send_message(message.chat.id, "You are now verified and will receive generated summaries") 120 | return 121 | else: 122 | if not message.text in self.allowed_commands: 123 | self.bot.send_message(message.chat.id, 124 | "Invalid command, valid commands are: " + ", ".join( 125 | self.allowed_commands)) 126 | return 127 | self.set_current_user_context(sender, message.text[1:]) 128 | self.bot.send_message(message.chat.id, f"Switched context to {self.current_user_contexts[sender]}") 129 | else: 130 | if not sender in self.current_user_contexts: 131 | self.bot.send_message(message.chat.id, 132 | "Select context first, valid commands are: " + ", ".join( 133 | self.allowed_commands)) 134 | return 135 | self.chat_callback(message.text, sender, self.current_user_contexts[sender], 136 | lambda x: self.bot.send_message(message.chat.id, x)) 137 | --------------------------------------------------------------------------------