├── requirements.txt
├── config.json
├── LICENSE
└── telehunting.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas==2.1.4
2 | nltk==3.9.1
3 | telethon==1.29.2
4 | colorama==0.4.5
5 | 
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "initial_channel_links": [
3 |         "https://t.me/",
4 |         "https://t.me/",
5 |         "https://t.me/"
6 |     ],
7 |     "message_keywords": ["hack", "carding", "malware", "exploit", "cracking"],
8 |     "batch_size": 100
9 | }
10 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 Flare Systems
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/telehunting.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import re
3 | import pandas as pd
4 | import nltk
5 | from nltk.sentiment import SentimentIntensityAnalyzer
6 | from telethon.sync import TelegramClient
7 | from telethon.tl.functions.channels import JoinChannelRequest
8 | from telethon.errors import FloodWaitError, ChannelPrivateError
9 | from telethon.tl.types import Channel, User, Chat
10 | import multiprocessing
11 | from functools import partial
12 | import argparse
13 | import json
14 | import random
15 | import signal
16 | import os
17 | from datetime import datetime
18 | from colorama import init, Fore, Back, Style
19 | 
20 | init(autoreset=True)
21 | 
22 | PURPLE_BLUE = '\033[38;2;100;100;255m'
23 | LIGHT_PURPLE = '\033[38;2;200;180;255m'
24 | BOLD_WHITE = '\033[1;37m'
25 | 
26 | def print_info(message):
27 |     print(f"{PURPLE_BLUE}ℹ {BOLD_WHITE}{message}")
28 | 
29 | def print_success(message):
30 |     print(f"{LIGHT_PURPLE}✔ {BOLD_WHITE}{message}")
31 | 
32 | def print_warning(message):
33 |     print(f"{Fore.YELLOW}{Style.BRIGHT}⚠ {BOLD_WHITE}{message}")
34 | 
35 | def print_error(message):
36 |     print(f"{Fore.RED}✘ {message}")
37 | 
38 | def print_header(message):
39 |     print(f"\n{PURPLE_BLUE}{Style.BRIGHT}{message}")
40 |     print(f"{PURPLE_BLUE}{'-' * len(message)}{Style.RESET_ALL}")
41 | 
42 | def print_subheader(message):
43 |     print(f"\n{LIGHT_PURPLE}{Style.BRIGHT}{message}")
44 |     print(f"{LIGHT_PURPLE}{'-' * len(message)}{Style.RESET_ALL}")
45 | 
46 | def banner():
47 |     print(f"""
48 | 
49 | {Fore.BLUE}{Style.BRIGHT}
50 | 
51 | 
52 | +++++
53 | ++{LIGHT_PURPLE}= +{Style.RESET_ALL}{Fore.BLUE}{Style.BRIGHT}+
54 | ++{LIGHT_PURPLE}+ ++{Style.RESET_ALL}{Fore.BLUE}{Style.BRIGHT}+
55 | +++{LIGHT_PURPLE}+++{Style.RESET_ALL}{Fore.BLUE}{Style.BRIGHT}++*
56 | *+++*+***
57 | ********
58 | {LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}**********
59 | **{LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT} *********
60 | ***{LIGHT_PURPLE}##{Fore.BLUE}{Style.BRIGHT}**********
61 | *****{LIGHT_PURPLE}###{Fore.BLUE}{Style.BRIGHT}***********{LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}
62 | *********{LIGHT_PURPLE}####{Fore.BLUE} ******{LIGHT_PURPLE}########{Fore.BLUE}{Style.BRIGHT}
63 | ++{LIGHT_PURPLE}+{Fore.BLUE}{Style.BRIGHT}++**************{LIGHT_PURPLE}### #######{Fore.BLUE}{Style.BRIGHT} *******++{LIGHT_PURPLE}++{Fore.BLUE}{Style.BRIGHT}++
64 | +{LIGHT_PURPLE}++ +{Fore.BLUE}{Style.BRIGHT}**************{LIGHT_PURPLE}# ##{Fore.BLUE}{Style.BRIGHT} ************* +{LIGHT_PURPLE}{Fore.BLUE}{Style.BRIGHT}++
65 | ++{LIGHT_PURPLE}+ +{Fore.BLUE}{Style.BRIGHT}*********** {LIGHT_PURPLE}# #{Fore.BLUE}{Style.BRIGHT}*************+* +{LIGHT_PURPLE}{Fore.BLUE}{Style.BRIGHT}++
66 | +++{LIGHT_PURPLE}++{Fore.BLUE}{Style.BRIGHT}******** {LIGHT_PURPLE}######## ###{Fore.BLUE}{Style.BRIGHT}*************++{LIGHT_PURPLE}++{Fore.BLUE}{Style.BRIGHT}++
67 | {LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}**{LIGHT_PURPLE}####{Fore.BLUE}{Style.BRIGHT}****** {LIGHT_PURPLE}###{Fore.BLUE}{Style.BRIGHT}***********
68 | ************{LIGHT_PURPLE}###{Fore.BLUE}{Style.BRIGHT}*****
69 | **********{LIGHT_PURPLE}##{Fore.BLUE}{Style.BRIGHT}***
70 | ********* {LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}**
71 | ********* *
72 | ******** {LIGHT_PURPLE}#{Fore.BLUE}{Style.BRIGHT}
73 | *********
74 | **+{LIGHT_PURPLE}**{Fore.BLUE}{Style.BRIGHT}+***
75 | *+{LIGHT_PURPLE}+ +{Fore.BLUE}{Style.BRIGHT}++
76 | +{LIGHT_PURPLE}+ +{Fore.BLUE}{Style.BRIGHT}++
77 | ++{LIGHT_PURPLE}+{Fore.BLUE}{Style.BRIGHT}++
78 | 
79 | 
80 | 
81 | 
82 | 
83 | {Style.RESET_ALL}
84 | """)
85 | 
86 | # Ensure NLTK data is downloaded
87 | def ensure_nltk_data():
88 |     try:
89 |         nltk.data.find('tokenizers/punkt')
90 |         nltk.data.find('sentiment/vader_lexicon.zip')
91 |     except LookupError:
92 |         print_info("Downloading NLTK data...")
93 |         nltk.download('punkt', quiet=True)
94 |         nltk.download('vader_lexicon', quiet=True)
95 | 
96 | # Extract Telegram channel links from messages
97 | def extract_channel_links(text):
98 |     if not text or not isinstance(text, str):
99 |         return []
100 |     pattern = r't\.me/(?:joinchat/)?[a-zA-Z0-9_-]+'
101 |     return re.findall(pattern, text)
102 | 
103 | # Clean and format channel links
104 | def clean_link(link):
105 |     if not link or not isinstance(link, str):
106 |         return None
107 | 
108 |     link = link.split(')')[0].strip()
109 | 
110 |     if re.match(r'^[a-zA-Z0-9_]{5,}$', link):
111 |         return link
112 | 
113 |     match = re.search(r't\.me/(?:joinchat/)?([a-zA-Z0-9_-]+)', link)
114 |     if match:
115 |         username_or_hash = match.group(1)
116 | 
117 |         if 'joinchat' in link:
118 |             return f'https://t.me/joinchat/{username_or_hash}'
119 | 
120 |         return username_or_hash
121 | 
122 |     return None
123 | 
124 | # Manage discovered channels
125 | class ChannelManager:
126 |     def __init__(self):
127 |         self.discovered_channels = set()
128 |         self.joined_channels = set()
129 |         self.processed_channels = set()
130 |         self.channel_affiliations = {}
131 |         self.initial_channels = set()
132 | 
133 |     def add_channel(self, link, source_channel=None):
134 |         cleaned_link = clean_link(link)
135 |         if cleaned_link and cleaned_link not in self.joined_channels and cleaned_link not in self.processed_channels:
136 |             self.discovered_channels.add(cleaned_link)
137 |             if source_channel:
138 |                 self.channel_affiliations[cleaned_link] = source_channel
139 |             else:
140 |                 self.initial_channels.add(cleaned_link)  # Mark as initial channel if no source
141 | 
142 |     def mark_as_joined(self, link):
143 |         cleaned_link = clean_link(link)
144 |         if cleaned_link:
145 |             self.joined_channels.add(cleaned_link)
146 |             self.discovered_channels.discard(cleaned_link)
147 | 
148 |     def mark_as_processed(self, link):
149 |         cleaned_link = clean_link(link)
150 |         if cleaned_link:
151 |             self.processed_channels.add(cleaned_link)
152 |             self.discovered_channels.discard(cleaned_link)
153 | 
154 |     def has_unprocessed_channels(self):
155 |         return len(self.discovered_channels) > 0
156 | 
157 |     def get_next_channel(self):
158 |         if self.discovered_channels:
159 |             return self.discovered_channels.pop()
160 |         return None
161 | 
162 |     def get_affiliation(self, link):
163 |         cleaned_link = clean_link(link)
164 |         return self.channel_affiliations.get(cleaned_link, None)
165 | 
166 |     def display_status(self):
167 |         print_subheader("Channel Status")
168 |         print(f"  Channels waiting to be processed: {len(self.discovered_channels)}")
169 |         print(f"  Channels joined: {len(self.joined_channels)}")
170 |         print(f"  Channels processed: {len(self.processed_channels)}")
171 | 
172 | # Join a channel by URL or username
173 | async def join_channel(client, channel_manager, link, max_retries=3):
174 |     cleaned_link = clean_link(link)
175 |     if not cleaned_link:
176 |         print_warning(f"Invalid link format: {link}")
177 |         return False
178 | 
179 |     retries = 0
180 |     while retries < max_retries:
181 |         try:
182 |             entity = await client.get_entity(cleaned_link)
183 |             entity_name = await get_entity_name(entity)
184 | 
185 |             if isinstance(entity, (Channel, Chat)):
186 |                 if getattr(entity, 'username', None):  # basic Chat objects have no username attribute
187 |                     await client(JoinChannelRequest(entity))
188 |                 else:
189 |                     print_warning(f"Cannot join private channel {entity_name} without an invite link")
190 |                     return False
191 |             elif isinstance(entity, User):
192 |                 print_info(f"Entity {entity_name} is a user, no need to join")
193 |             else:
194 |                 print_warning(f"Unknown entity type for {entity_name}")
195 |                 return False
196 | 
197 |             print_success(f"Successfully processed entity: {entity_name}")
198 |             channel_manager.mark_as_joined(cleaned_link)
199 |             return True
200 | 
201 |         except FloodWaitError as e:
202 |             wait_time = min(e.seconds, 30)
203 |             print_warning(f"FloodWaitError encountered. Waiting for {wait_time} seconds. (Attempt {retries + 1}/{max_retries})")
204 |             await asyncio.sleep(wait_time)
205 |         except Exception as e:
206 |             print_error(f"Failed to process entity {cleaned_link}: {e}")
207 | 
208 |         retries += 1
209 |         await asyncio.sleep(1)
210 | 
211 |     print_warning(f"Max retries exceeded. Failed to process entity: {cleaned_link}")
212 |     return False
213 | 
214 | # Load configuration
215 | def load_config(config_path):
216 |     if os.path.exists(config_path):
217 |         with open(config_path, 'r') as f:
218 |             return json.load(f)
219 |     return None
220 | 
221 | # Create a default config file if none is present (a sample config.json ships with the repo for clarity)
222 | def create_default_config(config_path):
223 |     default_config = {
224 |         "initial_channel_links": [],
225 |         "message_keywords": [],
226 |         "batch_size": 100
227 |     }
228 |     with open(config_path, 'w') as f:
229 |         json.dump(default_config, f, indent=4)
230 |     print_success(f"Default config file created at {config_path}")
231 |     print_info("Please edit this file with your channel links and keywords.")
232 |     return default_config
233 | 
234 | # Home-made sentiment lexicon layered on top of VADER (a first attempt; tune the weights to taste)
235 | class CybersecuritySentimentAnalyzer:
236 |     def __init__(self):
237 |         self.sia = SentimentIntensityAnalyzer()
238 |         self.cybersecurity_lexicon = {
239 |             'vulnerability': 2.0,
240 |             'exploit': -3.0,
241 |             'patch': 2.0,
242 |             'hack': -2.0,
243 |             'secure': 3.0,
244 |             'breach': -4.0,
245 |             'protect': 3.0,
246 |             'malware': -3.0,
247 |             'ransomware': -4.0,
248 |             'encryption': 2.0,
249 |             'backdoor': -3.0,
250 |             'firewall': 2.0,
251 |             'phishing': -3.0,
252 |             'authentication': 2.0,
253 |             'threat': -2.0,
254 |             'zero-day': -4.0,
255 |             'security': 1.0,
256 |             'attack': -2.0,
257 |             'defense': 2.0,
258 |             'compromise': -3.0
259 |         }
260 |         self.sia.lexicon.update(self.cybersecurity_lexicon)
261 | 
262 |     def polarity_scores(self, text):
263 |         return self.sia.polarity_scores(text)
264 | 
265 | # Global variables
266 | current_batch = []
267 | batch_counter = 1
268 | 
269 | # Handle keyboard interrupt (Ctrl+C)
270 | def signal_handler(sig, frame):
271 |     global current_batch, batch_counter
272 |     print_warning("\nKeyboard interrupt received. Saving current batch and exiting...")
273 |     save_current_batch(current_batch, batch_counter)
274 |     exit(0)
275 | 
276 | # Save current batch to CSV
277 | def save_current_batch(batch, batch_counter):
278 |     if batch:
279 |         df = pd.DataFrame(batch, columns=['Sender ID', 'Date', 'Message', 'Sentiment', 'Compound_Sentiment'])
280 | 
281 |         # If sentiment analysis hasn't been done, do it now
282 |         if df['Sentiment'].isnull().all():
283 |             cybersecurity_sia = CybersecuritySentimentAnalyzer()
284 |             df['Sentiment'] = df['Message'].apply(cybersecurity_sia.polarity_scores)
285 |             df['Compound_Sentiment'] = df['Sentiment'].apply(lambda x: x['compound'] if isinstance(x, dict) else None)
286 | 
287 |         batch_filename = f"telegram_scraped_messages_batch_{batch_counter}.csv"
288 |         df.to_csv(batch_filename, index=False)
289 |         print_success(f"Saved batch {batch_counter} with {len(batch)} messages to {batch_filename}")
290 |     else:
291 |         print_info("No messages in the current batch.")
292 | 
293 | # Generate the sentiment report
294 | def generate_sentiment_report(df):
295 |     try:
296 |         # Ensure Compound_Sentiment is float
297 |         df['Compound_Sentiment'] = pd.to_numeric(df['Compound_Sentiment'], errors='coerce')
298 | 
299 |         # Calculate average sentiment scores
300 |         avg_sentiment = pd.DataFrame(df['Sentiment'].dropna().tolist()).mean()
301 | 
302 |         # Categorise messages based on compound sentiment (unparseable scores count as neutral)
303 |         df['Sentiment_Category'] = df['Compound_Sentiment'].fillna(0).apply(lambda x:
304 |             'High Alert' if x <= -0.5 else
305 |             'Potential Threat' if -0.5 < x <= -0.1 else
306 |             'Neutral' if -0.1 < x < 0.1 else
307 |             'Potentially Positive' if 0.1 <= x < 0.5 else
308 |             'Very Positive'
309 |         )
310 |         sentiment_counts = df['Sentiment_Category'].value_counts()
311 |         total_messages = len(df)
312 | 
313 |         # Calculate overall sentiment score
314 |         overall_score = avg_sentiment.get('compound', 0) * 100
315 | 
316 |         report = f"""
317 | Sentiment Analysis Report
318 | {'-' * 50}
319 | Total messages analyzed: {total_messages}
320 | 
321 | Overall Sentiment Score: {overall_score:.1f}/100
322 | Interpretation:
323 | {interpret_overall_score(overall_score)}
324 | 
325 | Message Sentiment Breakdown:
326 | """
327 | 
328 |         categories = [
329 |             ('High Alert', "Severe Threats"),
330 |             ('Potential Threat', "Potential Threats"),
331 |             ('Neutral', "Neutral Messages"),
332 |             ('Potentially Positive', "Potentially Positive"),
333 |             ('Very Positive', "Strong Security Indicators")
334 |         ]
335 | 
336 |         for category, description in categories:
337 |             count = sentiment_counts.get(category, 0)
338 |             percentage = (count / total_messages) * 100
339 |             report += f"{category} ({description}): {count} messages ({percentage:.1f}%)\n"
340 | 
341 |         report += "\nTop 5 Most Concerning Messages (Potential Threats):\n"
342 | 
343 |         for _, row in df.nsmallest(5, 'Compound_Sentiment').iterrows():
344 |             threat_level = abs(row['Compound_Sentiment']) * 100
345 |             report += f"- {row['Message'][:100]}... (Threat Level: {threat_level:.1f}/100)\n"
346 | 
347 |         report += "\nTop 5 Most Positive Messages (Potential Security Improvements):\n"
348 | 
349 |         for _, row in df.nlargest(5, 'Compound_Sentiment').iterrows():
350 |             positivity_level = row['Compound_Sentiment'] * 100
351 |             report += f"- {row['Message'][:100]}... (Positivity Level: {positivity_level:.1f}/100)\n"
352 | 
353 |         with open('sentiment_report.txt', 'w', encoding='utf-8') as f:
354 |             f.write(report)
355 | 
356 |         print_success("Sentiment analysis report generated and saved to 'sentiment_report.txt'")
357 | 
358 |         # Print the sentiment category counts to the console with colors
359 |         print_info("Sentiment Category Counts:")
360 |         for category, description in categories:
361 |             count = sentiment_counts.get(category, 0)
362 |             percentage = (count / total_messages) * 100
363 |             color = get_category_color(category)
364 |             print(f"{color}{category}: {count} ({percentage:.1f}%){Style.RESET_ALL}")
365 | 
366 |     except Exception as e:
367 |         print_error(f"Error generating sentiment report: {e}")
368 |         df.info()  # df.info() prints directly and returns None, so call it rather than interpolate it
369 | 
370 | def get_category_color(category):
371 |     color_map = {
372 |         'High Alert': Fore.RED,
373 |         'Potential Threat': Fore.YELLOW,
374 |         'Neutral': Fore.WHITE,
375 |         'Potentially Positive': Fore.LIGHTGREEN_EX,
376 |         'Very Positive': Fore.GREEN
377 |     }
378 |     return color_map.get(category, '')
379 | 
380 | def interpret_overall_score(score):
381 |     if score <= -50:
382 |         return "Critical situation. Numerous severe threats detected. Immediate action required."
383 |     elif -50 < score <= -10:
384 |         return "Concerning situation. Multiple potential threats identified. Heightened vigilance needed."
385 |     elif -10 < score < 10:
386 |         return "Neutral situation. No significant threats or improvements detected. Maintain standard security measures."
387 |     elif 10 <= score < 50:
388 |         return "Positive situation. Some potential security improvements identified. Consider implementing suggested measures."
389 |     else:
390 |         return "Very positive situation. Strong security indicators present. Continue current security practices and look for areas of improvement."
391 | 
392 | def analyze_sentiment(cybersecurity_sia, message):
393 |     return cybersecurity_sia.polarity_scores(message)
394 | 
395 | def process_messages(messages, num_processes=multiprocessing.cpu_count()):
396 |     df = pd.DataFrame(messages, columns=['Sender ID', 'Date', 'Message', 'Sentiment', 'Compound_Sentiment'])
397 | 
398 |     cybersecurity_sia = CybersecuritySentimentAnalyzer()
399 | 
400 |     # Parallelize sentiment analysis
401 |     with multiprocessing.Pool(processes=num_processes) as pool:
402 |         partial_analyze = partial(analyze_sentiment, cybersecurity_sia)
403 |         df['Sentiment'] = pool.map(partial_analyze, df['Message'])
404 | 
405 |     df['Compound_Sentiment'] = df['Sentiment'].apply(lambda x: x['compound'])
406 | 
407 |     generate_sentiment_report(df)
408 |     return df
409 | 
410 | async def get_entity_name(entity):
411 |     if isinstance(entity, User):
412 |         return f"@{entity.username}" if entity.username else f"User({entity.id})"
413 |     elif isinstance(entity, (Channel, Chat)):
414 |         return entity.title or f"Channel({entity.id})"
415 |     else:
416 |         return f"Unknown({type(entity).__name__})"
417 | 
418 | async def scrape_messages(client, entity, message_limit, keywords, channel_manager, affiliated_channel=None):
419 |     messages = []
420 |     entity_name = await get_entity_name(entity)  # resolved up front so the except blocks below can use it
421 |     try:
422 |         async for message in client.iter_messages(entity, limit=message_limit):
423 |             if message.text:
424 |                 # Queue any t.me links found in the message for later crawling
425 |                 for link in extract_channel_links(message.text):
426 |                     channel_manager.add_channel(link, source_channel=entity_name)
427 |                 # Keep only messages matching the configured keywords (keep everything if none are set)
428 |                 if keywords and not any(kw.lower() in message.text.lower() for kw in keywords):
429 |                     continue
430 |                 if affiliated_channel:
431 |                     print_info(f"Message from {Fore.CYAN}{Style.BRIGHT}{entity_name}{Style.RESET_ALL}.{Fore.YELLOW}{Style.BRIGHT} <-- {affiliated_channel}{Style.RESET_ALL}: {message.text}")
432 |                 else:
433 |                     print_info(f"Message from {Fore.CYAN}{Style.BRIGHT}{entity_name}{Style.RESET_ALL}: {message.text}")
434 |                 messages.append([message.sender_id, message.date, message.text, None, None])
435 |                 await asyncio.sleep(0.1)
436 |     except FloodWaitError as e:
437 |         print_warning(f"FloodWaitError in scrape_messages: {e}")
438 |         await asyncio.sleep(min(e.seconds, 30))
439 |     except Exception as e:
440 |         print_error(f"Error scraping entity {entity_name}: {e}")
441 | 
442 |     return messages, entity_name
443 | 
444 | async def process_channels(client, channel_manager, message_depth, keywords, batch_processor):
445 |     while channel_manager.has_unprocessed_channels():
446 |         link = channel_manager.get_next_channel()
447 |         affiliated_channel = channel_manager.get_affiliation(link)
448 |         try:
449 |             join_success = await retry_with_backoff(lambda: join_channel(client, channel_manager, link))
450 |             if join_success:
451 |                 entity = await client.get_entity(link)
452 |                 entity_messages, channel_name = await scrape_messages(client, entity, message_depth, keywords, channel_manager, affiliated_channel)
453 | 
454 |                 # Add messages to batch processor with channel name and affiliation
455 |                 batch_processor.add_messages(entity_messages, channel_name, affiliated_channel)
456 |             else:
457 |                 print_warning(f"Skipping entity {link} due to joining failure")
458 |         except Exception as e:
459 |             print_error(f"Failed to process entity {link}: {e}")
460 |         finally:
461 |             channel_manager.mark_as_processed(link)
462 | 
463 |         await asyncio.sleep(1)  # Small delay between processing channels
464 | 
465 | async def process_single_channel(client, channel_manager, link, message_depth, keywords):
466 |     try:
467 |         join_success = await retry_with_backoff(lambda: join_channel(client, channel_manager, link))
468 |         if join_success:
469 |             entity = await client.get_entity(link)
470 |             entity_name = await get_entity_name(entity)
471 |             print_info(f"Scraping messages from: {entity_name}")
472 |             entity_messages, _ = await scrape_messages(client, entity, message_depth, keywords, channel_manager)
473 |             return entity_messages
474 |         else:
475 |             print_warning(f"Skipping entity {link} due to joining failure")
476 |     except Exception as e:
477 |         print_error(f"Failed to process entity {link}: {e}")
478 |     return []
479 | 
480 | async def retry_with_backoff(coro_fn, max_retries=5, base_delay=1, max_delay=60):
481 |     retries = 0
482 |     while True:
483 |         try:
484 |             return await coro_fn()  # coro_fn is a factory: a coroutine object can only be awaited once, so build a fresh one per attempt
485 |         except FloodWaitError as e:
486 |             if retries >= max_retries:
487 |                 raise
488 |             delay = min(base_delay * (2 ** retries) + random.uniform(0, 1), max_delay)
489 |             print_warning(f"FloodWaitError encountered. Retrying in {delay:.2f} seconds. (Attempt {retries + 1}/{max_retries})")
490 |             await asyncio.sleep(delay)
491 |             retries += 1
492 |         except Exception as e:
493 |             print_error(f"Unexpected error: {e}")
494 |             raise
495 | 
496 | 
497 | 
498 | class BatchProcessor:
499 |     def __init__(self, batch_size=1000, cybersecurity_sia=None):
500 |         self.batch = []
501 |         self.batch_size = batch_size
502 |         self.batch_counter = 1
503 |         self.total_messages = 0
504 |         self.cybersecurity_sia = cybersecurity_sia or CybersecuritySentimentAnalyzer()
505 |         self.all_messages_df = pd.DataFrame(columns=['Sender ID', 'Date', 'Message', 'Sentiment', 'Compound_Sentiment', 'Channel Name', 'Affiliated Channel'])
506 | 
507 |     def add_messages(self, messages, channel_name, affiliated_channel):
508 |         messages_with_info = [
509 |             message + [channel_name, affiliated_channel if affiliated_channel else "Initial Config"]
510 |             for message in messages
511 |         ]
512 |         self.batch.extend(messages_with_info)
513 |         self.total_messages += len(messages)
514 |         if len(self.batch) >= self.batch_size:
515 |             self.save_batch()
516 | 
517 |     def save_batch(self):
518 |         if self.batch:
519 |             df = pd.DataFrame(self.batch, columns=['Sender ID', 'Date', 'Message', 'Sentiment', 'Compound_Sentiment', 'Channel Name', 'Affiliated Channel'])
520 |             df['Sentiment'] = df['Message'].apply(self.cybersecurity_sia.polarity_scores)
521 |             df['Compound_Sentiment'] = df['Sentiment'].apply(lambda x: x['compound']).astype(float)
522 | 
523 |             batch_filename = f"telegram_scraped_messages_batch_{self.batch_counter}.csv"
524 |             df.to_csv(batch_filename, index=False)
525 |             print_success(f"Saved batch {self.batch_counter} with {len(self.batch)} messages to {batch_filename}")
526 | 
527 |             # Ensure consistent dtypes
528 |             for col in df.columns:
529 |                 if col in self.all_messages_df.columns:
530 |                     df[col] = df[col].astype(self.all_messages_df[col].dtype)
531 | 
532 |             self.all_messages_df = pd.concat([self.all_messages_df, df], ignore_index=True)
533 | 
534 |             self.batch = []
535 |             self.batch_counter += 1
536 | 
537 |     def generate_final_report(self):
538 |         print_info(f"Generating final report. Total messages: {len(self.all_messages_df)}")
539 | 
540 |         if self.all_messages_df.empty:
541 |             print_warning("No messages to generate report from.")
542 |             return
543 | 
544 |         generate_sentiment_report(self.all_messages_df)
545 | 
546 |     def finalize(self):
547 |         self.save_batch()  # Save any remaining messages
548 |         self.generate_final_report()
549 | 
550 |     def __del__(self):
551 |         self.save_batch()  # Save any remaining messages when the object is destroyed
552 | 
553 | # Pretty much our main function at this point
554 | async def run_scraper(config, message_depth, channel_depth):
555 |     await client.start()
556 | 
557 |     signal.signal(signal.SIGINT, signal_handler)
558 | 
559 |     try:
560 |         channel_manager = ChannelManager()
561 |         cybersecurity_sia = CybersecuritySentimentAnalyzer()
562 |         batch_processor = BatchProcessor(cybersecurity_sia=cybersecurity_sia)
563 | 
564 |         # Add initial channels from config
565 |         for link in config['initial_channel_links']:
566 |             channel_manager.add_channel(link)
567 | 
568 |         start_time = datetime.now()
569 |         print_header(f"Scraping started at {start_time}")
570 | 
571 |         depth = 0
572 |         while channel_manager.has_unprocessed_channels() and depth < channel_depth:
573 |             print_subheader(f"Crawling at depth {depth + 1}/{channel_depth}")
574 |             channel_manager.display_status()
575 | 
576 |             await process_channels(client, channel_manager, message_depth, config['message_keywords'], batch_processor)
577 | 
578 |             depth += 1
579 | 
580 |             # Allow time for rate limiting
581 |             await asyncio.sleep(5)
582 | 
583 |         end_time = datetime.now()
584 |         duration = end_time - start_time
585 |         print_header(f"Scraping completed at {end_time}")
586 |         print_info(f"Total duration: {duration}")
587 |         print_info(f"Total messages scraped: {batch_processor.total_messages}")
588 |         print_info(f"Total channels processed: {len(channel_manager.processed_channels)}")
589 | 
590 |         # Finalize batch processing and generate report
591 |         batch_processor.finalize()
592 | 
593 |     except Exception as e:
594 |         print_error(f"An error occurred during scraping: {e}")
595 |     finally:
596 |         await client.disconnect()
597 | 
598 | async def process_all_channels(client, channel_manager, message_depth, keywords):
599 |     all_messages = []
600 |     channels_to_process = list(channel_manager.discovered_channels)
601 | 
602 |     for link in channels_to_process:
603 |         try:
604 |             join_success = await retry_with_backoff(lambda: join_channel(client, channel_manager, link))
605 |             if join_success:
606 |                 entity = await client.get_entity(link)
607 |                 entity_name = await get_entity_name(entity)
608 |                 print_info(f"Scraping messages from: {entity_name}")
609 |                 entity_messages, _ = await scrape_messages(client, entity, message_depth, keywords, channel_manager)
610 |                 all_messages.extend(entity_messages)
611 | 
612 |                 # Newly discovered channels are already queued by
613 |                 # scrape_messages() via channel_manager.add_channel(),
614 |                 # so there is nothing extra to collect here before
615 |                 # moving on to the next link.
616 |             else:
617 |                 print_warning(f"Skipping entity {link} due to joining failure")
618 |         except Exception as e:
619 |             print_error(f"Failed to process entity {link}: {e}")
620 | 
621 |         await asyncio.sleep(1)  # Small delay between processing channels
622 | 
623 |     return all_messages
624 | 
625 | async def process_discovered_channels(client, channel_manager, message_depth, keywords, max_channels_per_depth):
626 |     channels_processed = 0
627 |     while channel_manager.discovered_channels and channels_processed < max_channels_per_depth:
628 |         link = channel_manager.get_next_channel()
629 |         if await join_channel(client, channel_manager, link):
630 |             try:
631 |                 channel = await client.get_entity(link)
632 |                 print_info(f"Scraping messages from newly discovered channel: {channel.title}")
633 |                 await scrape_messages(client, channel, message_depth, keywords, channel_manager)
634 |                 channels_processed += 1
635 |             except Exception as e:
636 |                 print_error(f"Failed to scrape newly discovered channel {link}: {e}")
637 | 
638 |         await asyncio.sleep(2)
639 | 
640 | if __name__ == "__main__":
641 |     banner()
642 |     ensure_nltk_data()
643 | 
644 |     parser = argparse.ArgumentParser(description='Telegram Content Crawler')
645 |     parser.add_argument('--config', type=str, default='config.json', help='Path to the configuration file')
646 |     parser.add_argument('--message-depth', type=int, default=1000, help='Number of messages to crawl per channel')
647 |     parser.add_argument('--channel-depth', type=int, default=2, help='Depth of channel crawling')
648 |     parser.add_argument('--api-id', type=str, help='API ID for Telegram client')
649 |     parser.add_argument('--api-hash', type=str, help='API hash for Telegram client')
650 |     parser.add_argument('--phone-number', type=str, help='Phone number for Telegram client')
651 |     args = parser.parse_args()
652 | 
653 |     config = load_config(args.config)
654 |     if config is None:
655 |         user_input = input(f"Config file '{args.config}' not found. Create a default config? (y/n): ")
656 |         if user_input.lower() == 'y':
657 |             config = create_default_config(args.config)
658 |         else:
659 |             print_error("Please provide a valid config file. Exiting.")
660 |             exit(1)
661 | 
662 |     API_ID = ""
663 |     API_HASH = ""
664 |     PHONE_NUMBER = ""
665 | 
666 |     api_id = args.api_id or API_ID
667 |     api_hash = args.api_hash or API_HASH
668 |     phone_number = args.phone_number or PHONE_NUMBER
669 | 
670 |     if not api_id or not api_hash or not phone_number:
671 |         print_error("API credentials are missing. Provide them as command-line arguments or fill in API_ID, API_HASH and PHONE_NUMBER in the script (lines 662-664).")
672 |         exit(1)
673 | 
674 |     client = TelegramClient('session_name', int(api_id), api_hash)  # Telethon expects a numeric api_id
675 | 
676 |     with client.start(phone=phone_number):  # pass the phone so Telethon doesn't prompt for it interactively
677 |         client.loop.run_until_complete(run_scraper(config, args.message_depth, args.channel_depth))
678 | 
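679 | # Example invocation (a sketch; every credential value below is a placeholder --
680 | # substitute your own api_id / api_hash from https://my.telegram.org):
681 | #   python telehunting.py --api-id 123456 --api-hash 0123456789abcdef --phone-number +15551234567
--------------------------------------------------------------------------------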