├── requirements.txt
├── config.json
├── LICENSE
└── telehunting.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==2.1.4
2 | nltk==3.9.1
3 | telethon==1.29.2
4 | colorama==0.4.5
5 | 
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "initial_channel_links": [
3 |         "https://t.me/",
4 |         "https://t.me/",
5 |         "https://t.me/"
6 |     ],
7 |     "message_keywords": ["hack", "carding", "malware", "exploit", "cracking"],
8 |     "batch_size": 100
9 | }
10 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 Flare Systems
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/telehunting.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import re
3 | import pandas as pd
4 | import nltk
5 | from nltk.sentiment import SentimentIntensityAnalyzer
6 | from telethon.sync import TelegramClient
7 | from telethon.tl.functions.channels import JoinChannelRequest
8 | from telethon.errors import FloodWaitError, ChannelPrivateError
9 | from telethon.tl.types import Channel, User, Chat
10 | import multiprocessing
11 | from functools import partial
12 | import argparse
13 | import json
14 | import random
15 | import signal
16 | import os
17 | from datetime import datetime
18 | from colorama import init, Fore, Back, Style
19 | 
20 | init(autoreset=True)
21 | 
22 | PURPLE_BLUE = '\033[38;2;100;100;255m'
23 | LIGHT_PURPLE = '\033[38;2;200;180;255m'
24 | BOLD_WHITE = '\033[1;37m'
25 | 
26 | def print_info(message):
27 |     print(f"{PURPLE_BLUE}ℹ {BOLD_WHITE}{message}")
28 | 
29 | def print_success(message):
30 |     print(f"{LIGHT_PURPLE}✔ {BOLD_WHITE}{message}")
31 | 
32 | def print_warning(message):
33 |     print(f"{Fore.YELLOW}{Style.BRIGHT}⚠ {BOLD_WHITE}{message}")
34 | 
35 | def print_error(message):
36 |     print(f"{Fore.RED}✘ {message}")
37 | 
38 | def print_header(message):
39 |     print(f"\n{PURPLE_BLUE}{Style.BRIGHT}{message}")
40 |     print(f"{PURPLE_BLUE}{'-' * len(message)}{Style.RESET_ALL}")
41 | 
42 | def print_subheader(message):
43 |     print(f"\n{LIGHT_PURPLE}{Style.BRIGHT}{message}")
44 |     print(f"{LIGHT_PURPLE}{'-' * len(message)}{Style.RESET_ALL}")
45 | 
46 | def banner():
47 |     print(f"""
48 | 
49 | {Fore.BLUE}{Style.BRIGHT}
50 | 
51 | 
52 | +++++
53 | ++{LIGHT_PURPLE}= +{Style.RESET_ALL}{Fore.BLUE}{Style.BRIGHT}+
54 | ++{LIGHT_PURPLE}+ ++{Style.RESET_ALL}{Fore.BLUE}{Style.BRIGHT}+
55 | +++{LIGHT_PURPLE}+++{Style.RESET_ALL}{Fore.BLUE}{Style.BRIGHT}++*
56 | *+++*+***
57 | ********
58 | {LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}**********
59 | **{LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT} *********
60 | ***{LIGHT_PURPLE}##{Fore.BLUE}{Style.BRIGHT}**********
61 | *****{LIGHT_PURPLE}###{Fore.BLUE}{Style.BRIGHT}***********{LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}
62 | *********{LIGHT_PURPLE}####{Fore.BLUE} ******{LIGHT_PURPLE}########{Fore.BLUE}{Style.BRIGHT}
63 | ++{LIGHT_PURPLE}+{Fore.BLUE}{Style.BRIGHT}++**************{LIGHT_PURPLE}### #######{Fore.BLUE}{Style.BRIGHT} *******++{LIGHT_PURPLE}++{Fore.BLUE}{Style.BRIGHT}++
64 | +{LIGHT_PURPLE}++ +{Fore.BLUE}{Style.BRIGHT}**************{LIGHT_PURPLE}# ##{Fore.BLUE}{Style.BRIGHT} ************* +{LIGHT_PURPLE}{Fore.BLUE}{Style.BRIGHT}++
65 | ++{LIGHT_PURPLE}+ +{Fore.BLUE}{Style.BRIGHT}*********** {LIGHT_PURPLE}# #{Fore.BLUE}{Style.BRIGHT}*************+* +{LIGHT_PURPLE}{Fore.BLUE}{Style.BRIGHT}++
66 | +++{LIGHT_PURPLE}++{Fore.BLUE}{Style.BRIGHT}******** {LIGHT_PURPLE}######## ###{Fore.BLUE}{Style.BRIGHT}*************++{LIGHT_PURPLE}++{Fore.BLUE}{Style.BRIGHT}++
67 | {LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}**{LIGHT_PURPLE}####{Fore.BLUE}{Style.BRIGHT}****** {LIGHT_PURPLE}###{Fore.BLUE}{Style.BRIGHT}***********
68 | ************{LIGHT_PURPLE}###{Fore.BLUE}{Style.BRIGHT}*****
69 | **********{LIGHT_PURPLE}##{Fore.BLUE}{Style.BRIGHT}***
70 | ********* {LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}**
71 | ********* *
72 | ******** {LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}
73 | *********
74 | **+{LIGHT_PURPLE}**{Fore.BLUE}{Style.BRIGHT}+***
75 | *+{LIGHT_PURPLE}+ +{Fore.BLUE}{Style.BRIGHT}++
76 | +{LIGHT_PURPLE}+ +{Fore.BLUE}{Style.BRIGHT}++
77 | ++{LIGHT_PURPLE}+{Fore.BLUE}{Style.BRIGHT}++
78 | 
79 | 
80 | 
81 | 
82 | 
83 | {Style.RESET_ALL}
84 | """)
85 | 
86 | # Ensure NLTK data is downloaded
87 | def ensure_nltk_data():
88 |     try:
89 |         nltk.data.find('tokenizers/punkt')
90 |         nltk.data.find('sentiment/vader_lexicon.zip')
91 |     except LookupError:
92 |         print_info("Downloading NLTK data...")
93 |         nltk.download('punkt', quiet=True)
94 |         nltk.download('vader_lexicon', quiet=True)
95 | 
96 | # Extract Telegram channel links from messages
97 | def extract_channel_links(text):
98 |     if not text or not isinstance(text, str):
99 |         return []
100 |     pattern = r't\.me/(?:joinchat/)?[a-zA-Z0-9_-]+'
101 |     return re.findall(pattern, text)
102 | 
103 | # Clean and format channel links
104 | def clean_link(link):
105 |     if not link or not isinstance(link, str):
106 |         return None
107 | 
108 |     link = link.split(')')[0].strip()
109 | 
110 |     if re.match(r'^[a-zA-Z0-9_]{5,}$', link):
111 |         return link
112 | 
113 |     match = re.search(r't\.me/(?:joinchat/)?([a-zA-Z0-9_-]+)', link)
114 |     if match:
115 |         username_or_hash = match.group(1)
116 | 
117 |         if 'joinchat' in link:
118 |             return f'https://t.me/joinchat/{username_or_hash}'
119 | 
120 |         return username_or_hash
121 | 
122 |     return None
123 | 
124 | # Manage discovered channels
125 | class ChannelManager:
126 |     def __init__(self):
127 |         self.discovered_channels = set()
128 |         self.joined_channels = set()
129 |         self.processed_channels = set()
130 |         self.channel_affiliations = {}
131 |         self.initial_channels = set()
132 | 
133 |     def add_channel(self, link, source_channel=None):
134 |         cleaned_link = clean_link(link)
135 |         if cleaned_link and cleaned_link not in self.joined_channels and cleaned_link not in self.processed_channels:
136 |             self.discovered_channels.add(cleaned_link)
137 |             if source_channel:
138 |                 self.channel_affiliations[cleaned_link] = source_channel
139 |             else:
140 |                 self.initial_channels.add(cleaned_link)  # Mark as initial channel if no source
141 | 
142 |     def mark_as_joined(self, link):
143 |         cleaned_link = clean_link(link)
144 |         if cleaned_link:
145 |             self.joined_channels.add(cleaned_link)
146 |             self.discovered_channels.discard(cleaned_link)
147 | 
148 |     def mark_as_processed(self, link):
149 |         cleaned_link = clean_link(link)
150 |         if cleaned_link:
151 |             self.processed_channels.add(cleaned_link)
152 |             self.discovered_channels.discard(cleaned_link)
153 | 
154 |     def has_unprocessed_channels(self):
155 |         return len(self.discovered_channels) > 0
156 | 
157 |     def get_next_channel(self):
158 |         if self.discovered_channels:
159 |             return self.discovered_channels.pop()
160 |         return None
161 | 
162 |     def get_affiliation(self, link):
163 |         cleaned_link = clean_link(link)
164 |         return self.channel_affiliations.get(cleaned_link, None)
165 | 
166 |     def display_status(self):
167 |         print_subheader("Channel Status")
168 |         print(f"  Channels waiting to be processed: {len(self.discovered_channels)}")
169 |         print(f"  Channels joined: {len(self.joined_channels)}")
170 |         print(f"  Channels processed: {len(self.processed_channels)}")
171 | 
172 | # Join a channel by URL or username
173 | async def join_channel(client, channel_manager, link, max_retries=3):
174 |     cleaned_link = clean_link(link)
175 |     if not cleaned_link:
176 |         print_warning(f"Invalid link format: {link}")
177 |         return False
178 | 
179 |     retries = 0
180 |     while retries < max_retries:
181 |         try:
182 |             entity = await client.get_entity(cleaned_link)
183 |             entity_name = await get_entity_name(entity)
184 | 
185 |             if isinstance(entity, (Channel, Chat)):
186 |                 if getattr(entity, 'username', None):  # basic Chat objects have no username attribute
187 |                     await client(JoinChannelRequest(entity))
188 |                 else:
189 |                     print_warning(f"Cannot join private channel {entity_name} without an invite link")
190 |                     return False
191 |             elif isinstance(entity, User):
192 |                 print_info(f"Entity {entity_name} is a user, no need to join")
193 |             else:
194 |                 print_warning(f"Unknown entity type for {entity_name}")
195 |                 return False
196 | 
197 |             print_success(f"Successfully processed entity: {entity_name}")
198 |             channel_manager.mark_as_joined(cleaned_link)
199 |             return True
200 | 
201 |         except FloodWaitError as e:
202 |             wait_time = min(e.seconds, 30)
203 |             print_warning(f"FloodWaitError encountered. Waiting for {wait_time} seconds. (Attempt {retries + 1}/{max_retries})")
204 |             await asyncio.sleep(wait_time)
205 |         except Exception as e:
206 |             print_error(f"Failed to process entity {cleaned_link}: {e}")
207 | 
208 |         retries += 1
209 |         await asyncio.sleep(1)
210 | 
211 |     print_warning(f"Max retries exceeded. Failed to process entity: {cleaned_link}")
212 |     return False
213 | 
214 | # Load configuration
215 | def load_config(config_path):
216 |     if os.path.exists(config_path):
217 |         with open(config_path, 'r') as f:
218 |             return json.load(f)
219 |     return None
220 | 
221 | # Create a default config file if none is present (a sample config.json ships with the repo for clarity)
222 | def create_default_config(config_path):
223 |     default_config = {
224 |         "initial_channel_links": [],
225 |         "message_keywords": [],
226 |         "batch_size": 100
227 |     }
228 |     with open(config_path, 'w') as f:
229 |         json.dump(default_config, f, indent=4)
230 |     print_success(f"Default config file created at {config_path}")
231 |     print_info("Please edit this file with your channel links and keywords.")
232 |     return default_config
233 | 
234 | # Home-made sentiment lexicon layered on top of VADER (a first attempt; tune the weights to taste)
235 | class CybersecuritySentimentAnalyzer:
236 |     def __init__(self):
237 |         self.sia = SentimentIntensityAnalyzer()
238 |         self.cybersecurity_lexicon = {
239 |             'vulnerability': 2.0,
240 |             'exploit': -3.0,
241 |             'patch': 2.0,
242 |             'hack': -2.0,
243 |             'secure': 3.0,
244 |             'breach': -4.0,
245 |             'protect': 3.0,
246 |             'malware': -3.0,
247 |             'ransomware': -4.0,
248 |             'encryption': 2.0,
249 |             'backdoor': -3.0,
250 |             'firewall': 2.0,
251 |             'phishing': -3.0,
252 |             'authentication': 2.0,
253 |             'threat': -2.0,
254 |             'zero-day': -4.0,
255 |             'security': 1.0,
256 |             'attack': -2.0,
257 |             'defense': 2.0,
258 |             'compromise': -3.0
259 |         }
260 |         self.sia.lexicon.update(self.cybersecurity_lexicon)
261 | 
262 |     def polarity_scores(self, text):
263 |         return self.sia.polarity_scores(text)
264 | 
265 | # Global variables
266 | current_batch = []
267 | batch_counter = 1
268 | 
269 | # Handle keyboard interrupt (Ctrl+C)
270 | def signal_handler(sig, frame):
271 |     global current_batch, batch_counter
272 |     print_warning("\nKeyboard interrupt received. Saving current batch and exiting...")
273 |     save_current_batch(current_batch, batch_counter)
274 |     exit(0)
275 | 
276 | # Save current batch to CSV
277 | def save_current_batch(batch, batch_counter):
278 |     if batch:
279 |         df = pd.DataFrame(batch, columns=['Sender ID', 'Date', 'Message', 'Sentiment', 'Compound_Sentiment'])
280 | 
281 |         # If sentiment analysis hasn't been done, do it now
282 |         if df['Sentiment'].isnull().all():
283 |             cybersecurity_sia = CybersecuritySentimentAnalyzer()
284 |             df['Sentiment'] = df['Message'].apply(cybersecurity_sia.polarity_scores)
285 |             df['Compound_Sentiment'] = df['Sentiment'].apply(lambda x: x['compound'] if isinstance(x, dict) else None)
286 | 
287 |         batch_filename = f"telegram_scraped_messages_batch_{batch_counter}.csv"
288 |         df.to_csv(batch_filename, index=False)
289 |         print_success(f"Saved batch {batch_counter} with {len(batch)} messages to {batch_filename}")
290 |     else:
291 |         print_info("No messages in the current batch.")
292 | 
293 | # Generate the sentiment report
294 | def generate_sentiment_report(df):
295 |     try:
296 |         # Ensure Compound_Sentiment is float
297 |         df['Compound_Sentiment'] = pd.to_numeric(df['Compound_Sentiment'], errors='coerce')
298 | 
299 |         # Calculate average sentiment scores
300 |         avg_sentiment = pd.DataFrame(df['Sentiment'].dropna().tolist()).mean()
301 | 
302 |         # Categorise messages based on compound sentiment (unparseable scores count as neutral)
303 |         df['Sentiment_Category'] = df['Compound_Sentiment'].fillna(0).apply(lambda x:
304 |             'High Alert' if x <= -0.5 else
305 |             'Potential Threat' if -0.5 < x <= -0.1 else
306 |             'Neutral' if -0.1 < x < 0.1 else
307 |             'Potentially Positive' if 0.1 <= x < 0.5 else
308 |             'Very Positive'
309 |         )
310 |         sentiment_counts = df['Sentiment_Category'].value_counts()
311 |         total_messages = len(df)
312 | 
313 |         # Calculate overall sentiment score
314 |         overall_score = avg_sentiment.get('compound', 0) * 100
315 | 
316 |         report = f"""
317 | Sentiment Analysis Report
318 | {'-' * 50}
319 | Total messages analyzed: {total_messages}
320 | 
321 | Overall Sentiment Score: {overall_score:.1f}/100
322 | Interpretation:
323 | {interpret_overall_score(overall_score)}
324 | 
325 | Message Sentiment Breakdown:
326 | """
327 | 
328 |         categories = [
329 |             ('High Alert', "Severe Threats"),
330 |             ('Potential Threat', "Potential Threats"),
331 |             ('Neutral', "Neutral Messages"),
332 |             ('Potentially Positive', "Potentially Positive"),
333 |             ('Very Positive', "Strong Security Indicators")
334 |         ]
335 | 
336 |         for category, description in categories:
337 |             count = sentiment_counts.get(category, 0)
338 |             percentage = (count / total_messages) * 100
339 |             report += f"{category} ({description}): {count} messages ({percentage:.1f}%)\n"
340 | 
341 |         report += "\nTop 5 Most Concerning Messages (Potential Threats):\n"
342 | 
343 |         for _, row in df.nsmallest(5, 'Compound_Sentiment').iterrows():
344 |             threat_level = abs(row['Compound_Sentiment']) * 100
345 |             report += f"- {row['Message'][:100]}... (Threat Level: {threat_level:.1f}/100)\n"
346 | 
347 |         report += "\nTop 5 Most Positive Messages (Potential Security Improvements):\n"
348 | 
349 |         for _, row in df.nlargest(5, 'Compound_Sentiment').iterrows():
350 |             positivity_level = row['Compound_Sentiment'] * 100
351 |             report += f"- {row['Message'][:100]}... (Positivity Level: {positivity_level:.1f}/100)\n"
352 | 
353 |         with open('sentiment_report.txt', 'w', encoding='utf-8') as f:
354 |             f.write(report)
355 | 
356 |         print_success("Sentiment analysis report generated and saved to 'sentiment_report.txt'")
357 | 
358 |         # Print the sentiment category counts to the console with colors
359 |         print_info("Sentiment Category Counts:")
360 |         for category, description in categories:
361 |             count = sentiment_counts.get(category, 0)
362 |             percentage = (count / total_messages) * 100
363 |             color = get_category_color(category)
364 |             print(f"{color}{category}: {count} ({percentage:.1f}%){Style.RESET_ALL}")
365 | 
366 |     except Exception as e:
367 |         print_error(f"Error generating sentiment report: {e}")
368 |         df.info()  # df.info() prints directly and returns None, so call it rather than interpolate it
369 | 
370 | def get_category_color(category):
371 |     color_map = {
372 |         'High Alert': Fore.RED,
373 |         'Potential Threat': Fore.YELLOW,
374 |         'Neutral': Fore.WHITE,
375 |         'Potentially Positive': Fore.LIGHTGREEN_EX,
376 |         'Very Positive': Fore.GREEN
377 |     }
378 |     return color_map.get(category, '')
379 | 
380 | def interpret_overall_score(score):
381 |     if score <= -50:
382 |         return "Critical situation. Numerous severe threats detected. Immediate action required."
383 |     elif -50 < score <= -10:
384 |         return "Concerning situation. Multiple potential threats identified. Heightened vigilance needed."
385 |     elif -10 < score < 10:
386 |         return "Neutral situation. No significant threats or improvements detected. Maintain standard security measures."
387 |     elif 10 <= score < 50:
388 |         return "Positive situation. Some potential security improvements identified. Consider implementing suggested measures."
389 |     else:
390 |         return "Very positive situation. Strong security indicators present. Continue current security practices and look for areas of improvement."
391 | 
392 | def analyze_sentiment(cybersecurity_sia, message):
393 |     return cybersecurity_sia.polarity_scores(message)
394 | 
395 | def process_messages(messages, num_processes=multiprocessing.cpu_count()):
396 |     df = pd.DataFrame(messages, columns=['Sender ID', 'Date', 'Message', 'Sentiment', 'Compound_Sentiment'])
397 | 
398 |     cybersecurity_sia = CybersecuritySentimentAnalyzer()
399 | 
400 |     # Parallelize sentiment analysis
401 |     with multiprocessing.Pool(processes=num_processes) as pool:
402 |         partial_analyze = partial(analyze_sentiment, cybersecurity_sia)
403 |         df['Sentiment'] = pool.map(partial_analyze, df['Message'])
404 | 
405 |     df['Compound_Sentiment'] = df['Sentiment'].apply(lambda x: x['compound'])
406 | 
407 |     generate_sentiment_report(df)
408 |     return df
409 | 
410 | async def get_entity_name(entity):
411 |     if isinstance(entity, User):
412 |         return f"@{entity.username}" if entity.username else f"User({entity.id})"
413 |     elif isinstance(entity, (Channel, Chat)):
414 |         return entity.title or f"Channel({entity.id})"
415 |     else:
416 |         return f"Unknown({type(entity).__name__})"
417 | 
418 | async def scrape_messages(client, entity, message_limit, keywords, channel_manager, affiliated_channel=None):
419 |     messages = []
420 |     entity_name = await get_entity_name(entity)  # resolved up front so the except blocks below can use it
421 |     try:
422 |         async for message in client.iter_messages(entity, limit=message_limit):
423 |             if message.text:
424 |                 # Queue any t.me links found in the message for later crawling
425 |                 for link in extract_channel_links(message.text):
426 |                     channel_manager.add_channel(link, source_channel=entity_name)
427 |                 # Keep only messages matching the configured keywords (keep everything if none are set)
428 |                 if keywords and not any(kw.lower() in message.text.lower() for kw in keywords):
429 |                     continue
430 |                 if affiliated_channel:
431 |                     print_info(f"Message from {Fore.CYAN}{Style.BRIGHT}{entity_name}{Style.RESET_ALL}.{Fore.YELLOW}{Style.BRIGHT} <-- {affiliated_channel}{Style.RESET_ALL}: {message.text}")
432 |                 else:
433 |                     print_info(f"Message from {Fore.CYAN}{Style.BRIGHT}{entity_name}{Style.RESET_ALL}: {message.text}")
434 |                 messages.append([message.sender_id, message.date, message.text, None, None])
435 |                 await asyncio.sleep(0.1)
436 |     except FloodWaitError as e:
437 |         print_warning(f"FloodWaitError in scrape_messages: {e}")
438 |         await asyncio.sleep(min(e.seconds, 30))
439 |     except Exception as e:
440 |         print_error(f"Error scraping entity {entity_name}: {e}")
441 | 
442 |     return messages, entity_name
443 | 
444 | async def process_channels(client, channel_manager, message_depth, keywords, batch_processor):
445 |     while channel_manager.has_unprocessed_channels():
446 |         link = channel_manager.get_next_channel()
447 |         affiliated_channel = channel_manager.get_affiliation(link)
448 |         try:
449 |             join_success = await retry_with_backoff(lambda: join_channel(client, channel_manager, link))
450 |             if join_success:
451 |                 entity = await client.get_entity(link)
452 |                 entity_messages, channel_name = await scrape_messages(client, entity, message_depth, keywords, channel_manager, affiliated_channel)
453 | 
454 |                 # Add messages to batch processor with channel name and affiliation
455 |                 batch_processor.add_messages(entity_messages, channel_name, affiliated_channel)
456 |             else:
457 |                 print_warning(f"Skipping entity {link} due to joining failure")
458 |         except Exception as e:
459 |             print_error(f"Failed to process entity {link}: {e}")
460 |         finally:
461 |             channel_manager.mark_as_processed(link)
462 | 
463 |         await asyncio.sleep(1)  # Small delay between processing channels
464 | 
465 | async def process_single_channel(client, channel_manager, link, message_depth, keywords):
466 |     try:
467 |         join_success = await retry_with_backoff(lambda: join_channel(client, channel_manager, link))
468 |         if join_success:
469 |             entity = await client.get_entity(link)
470 |             entity_name = await get_entity_name(entity)
471 |             print_info(f"Scraping messages from: {entity_name}")
472 |             entity_messages, _ = await scrape_messages(client, entity, message_depth, keywords, channel_manager)
473 |             return entity_messages
474 |         else:
475 |             print_warning(f"Skipping entity {link} due to joining failure")
476 |     except Exception as e:
477 |         print_error(f"Failed to process entity {link}: {e}")
478 |     return []
479 | 
480 | async def retry_with_backoff(coro_fn, max_retries=5, base_delay=1, max_delay=60):
481 |     retries = 0
482 |     while True:
483 |         try:
484 |             return await coro_fn()  # coro_fn is a factory: a coroutine object can only be awaited once, so build a fresh one per attempt
485 |         except FloodWaitError as e:
486 |             if retries >= max_retries:
487 |                 raise
488 |             delay = min(base_delay * (2 ** retries) + random.uniform(0, 1), max_delay)
489 |             print_warning(f"FloodWaitError encountered. Retrying in {delay:.2f} seconds. (Attempt {retries + 1}/{max_retries})")
490 |             await asyncio.sleep(delay)
491 |             retries += 1
492 |         except Exception as e:
493 |             print_error(f"Unexpected error: {e}")
494 |             raise
495 | 
496 | 
497 | 
498 | class BatchProcessor:
499 |     def __init__(self, batch_size=1000, cybersecurity_sia=None):
500 |         self.batch = []
501 |         self.batch_size = batch_size
502 |         self.batch_counter = 1
503 |         self.total_messages = 0
504 |         self.cybersecurity_sia = cybersecurity_sia or CybersecuritySentimentAnalyzer()
505 |         self.all_messages_df = pd.DataFrame(columns=['Sender ID', 'Date', 'Message', 'Sentiment', 'Compound_Sentiment', 'Channel Name', 'Affiliated Channel'])
506 | 
507 |     def add_messages(self, messages, channel_name, affiliated_channel):
508 |         messages_with_info = [
509 |             message + [channel_name, affiliated_channel if affiliated_channel else "Initial Config"]
510 |             for message in messages
511 |         ]
512 |         self.batch.extend(messages_with_info)
513 |         self.total_messages += len(messages)
514 |         if len(self.batch) >= self.batch_size:
515 |             self.save_batch()
516 | 
517 |     def save_batch(self):
518 |         if self.batch:
519 |             df = pd.DataFrame(self.batch, columns=['Sender ID', 'Date', 'Message', 'Sentiment', 'Compound_Sentiment', 'Channel Name', 'Affiliated Channel'])
520 |             df['Sentiment'] = df['Message'].apply(self.cybersecurity_sia.polarity_scores)
521 |             df['Compound_Sentiment'] = df['Sentiment'].apply(lambda x: x['compound']).astype(float)
522 | 
523 |             batch_filename = f"telegram_scraped_messages_batch_{self.batch_counter}.csv"
524 |             df.to_csv(batch_filename, index=False)
525 |             print_success(f"Saved batch {self.batch_counter} with {len(self.batch)} messages to {batch_filename}")
526 | 
527 |             # Ensure consistent dtypes
528 |             for col in df.columns:
529 |                 if col in self.all_messages_df.columns:
530 |                     df[col] = df[col].astype(self.all_messages_df[col].dtype)
531 | 
532 |             self.all_messages_df = pd.concat([self.all_messages_df, df], ignore_index=True)
533 | 
534 |             self.batch = []
535 |             self.batch_counter += 1
536 | 
537 |     def generate_final_report(self):
538 |         print_info(f"Generating final report. Total messages: {len(self.all_messages_df)}")
539 | 
540 |         if self.all_messages_df.empty:
541 |             print_warning("No messages to generate report from.")
542 |             return
543 | 
544 |         generate_sentiment_report(self.all_messages_df)
545 | 
546 |     def finalize(self):
547 |         self.save_batch()  # Save any remaining messages
548 |         self.generate_final_report()
549 | 
550 |     def __del__(self):
551 |         self.save_batch()  # Save any remaining messages when the object is destroyed
552 | 
553 | # Pretty much our main function at this point
554 | async def run_scraper(config, message_depth, channel_depth):
555 |     await client.start()
556 | 
557 |     signal.signal(signal.SIGINT, signal_handler)
558 | 
559 |     try:
560 |         channel_manager = ChannelManager()
561 |         cybersecurity_sia = CybersecuritySentimentAnalyzer()
562 |         batch_processor = BatchProcessor(cybersecurity_sia=cybersecurity_sia)
563 | 
564 |         # Add initial channels from config
565 |         for link in config['initial_channel_links']:
566 |             channel_manager.add_channel(link)
567 | 
568 |         start_time = datetime.now()
569 |         print_header(f"Scraping started at {start_time}")
570 | 
571 |         depth = 0
572 |         while channel_manager.has_unprocessed_channels() and depth < channel_depth:
573 |             print_subheader(f"Crawling at depth {depth + 1}/{channel_depth}")
574 |             channel_manager.display_status()
575 | 
576 |             await process_channels(client, channel_manager, message_depth, config['message_keywords'], batch_processor)
577 | 
578 |             depth += 1
579 | 
580 |             # Allow time for rate limiting
581 |             await asyncio.sleep(5)
582 | 
583 |         end_time = datetime.now()
584 |         duration = end_time - start_time
585 |         print_header(f"Scraping completed at {end_time}")
586 |         print_info(f"Total duration: {duration}")
587 |         print_info(f"Total messages scraped: {batch_processor.total_messages}")
588 |         print_info(f"Total channels processed: {len(channel_manager.processed_channels)}")
589 | 
590 |         # Finalize batch processing and generate report
591 |         batch_processor.finalize()
592 | 
593 |     except Exception as e:
594 |         print_error(f"An error occurred during scraping: {e}")
595 |     finally:
596 |         await client.disconnect()
597 | 
598 | async def process_all_channels(client, channel_manager, message_depth, keywords):
599 |     all_messages = []
600 |     channels_to_process = list(channel_manager.discovered_channels)
601 | 
602 |     for link in channels_to_process:
603 |         try:
604 |             join_success = await retry_with_backoff(lambda: join_channel(client, channel_manager, link))
605 |             if join_success:
606 |                 entity = await client.get_entity(link)
607 |                 entity_name = await get_entity_name(entity)
608 |                 print_info(f"Scraping messages from: {entity_name}")
609 |                 entity_messages, _ = await scrape_messages(client, entity, message_depth, keywords, channel_manager)
610 |                 all_messages.extend(entity_messages)
611 | 
612 |                 # Newly discovered channels are already queued by
613 |                 # scrape_messages() via channel_manager.add_channel(),
614 |                 # so there is nothing extra to collect here before
615 |                 # moving on to the next link.
616 |             else:
617 |                 print_warning(f"Skipping entity {link} due to joining failure")
618 |         except Exception as e:
619 |             print_error(f"Failed to process entity {link}: {e}")
620 | 
621 |         await asyncio.sleep(1)  # Small delay between processing channels
622 | 
623 |     return all_messages
624 | 
625 | async def process_discovered_channels(client, channel_manager, message_depth, keywords, max_channels_per_depth):
626 |     channels_processed = 0
627 |     while channel_manager.discovered_channels and channels_processed < max_channels_per_depth:
628 |         link = channel_manager.get_next_channel()
629 |         if await join_channel(client, channel_manager, link):
630 |             try:
631 |                 channel = await client.get_entity(link)
632 |                 print_info(f"Scraping messages from newly discovered channel: {channel.title}")
633 |                 await scrape_messages(client, channel, message_depth, keywords, channel_manager)
634 |                 channels_processed += 1
635 |             except Exception as e:
636 |                 print_error(f"Failed to scrape newly discovered channel {link}: {e}")
637 | 
638 |         await asyncio.sleep(2)
639 | 
640 | if __name__ == "__main__":
641 |     banner()
642 |     ensure_nltk_data()
643 | 
644 |     parser = argparse.ArgumentParser(description='Telegram Content Crawler')
645 |     parser.add_argument('--config', type=str, default='config.json', help='Path to the configuration file')
646 |     parser.add_argument('--message-depth', type=int, default=1000, help='Number of messages to crawl per channel')
647 |     parser.add_argument('--channel-depth', type=int, default=2, help='Depth of channel crawling')
648 |     parser.add_argument('--api-id', type=str, help='API ID for Telegram client')
649 |     parser.add_argument('--api-hash', type=str, help='API hash for Telegram client')
650 |     parser.add_argument('--phone-number', type=str, help='Phone number for Telegram client')
651 |     args = parser.parse_args()
652 | 
653 |     config = load_config(args.config)
654 |     if config is None:
655 |         user_input = input(f"Config file '{args.config}' not found. Create a default config? (y/n): ")
656 |         if user_input.lower() == 'y':
657 |             config = create_default_config(args.config)
658 |         else:
659 |             print_error("Please provide a valid config file. Exiting.")
660 |             exit(1)
661 | 
662 |     API_ID = ""
663 |     API_HASH = ""
664 |     PHONE_NUMBER = ""
665 | 
666 |     api_id = args.api_id or API_ID
667 |     api_hash = args.api_hash or API_HASH
668 |     phone_number = args.phone_number or PHONE_NUMBER
669 | 
670 |     if not api_id or not api_hash or not phone_number:
671 |         print_error("API credentials are missing. Provide them as command-line arguments or fill in API_ID, API_HASH and PHONE_NUMBER in the script (lines 662-664).")
672 |         exit(1)
673 | 
674 |     client = TelegramClient('session_name', int(api_id), api_hash)  # Telethon expects a numeric api_id
675 | 
676 |     with client.start(phone=phone_number):  # pass the phone so Telethon doesn't prompt for it interactively
677 |         client.loop.run_until_complete(run_scraper(config, args.message_depth, args.channel_depth))
678 | 
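679 | # Example invocation (a sketch; every credential value below is a placeholder --
680 | # substitute your own api_id / api_hash from https://my.telegram.org):
681 | #   python telehunting.py --api-id 123456 --api-hash 0123456789abcdef --phone-number +15551234567
--------------------------------------------------------------------------------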