├── .env ├── .pre-commit-config.yaml ├── README.md ├── channels.txt ├── contact_patterns.txt ├── example.env ├── example_config.yaml ├── logo.png ├── pytest.ini ├── requirements.txt ├── scripts └── run.py ├── setup.cfg ├── setup.py ├── telegraphite ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-313.pyc │ ├── cli.cpython-313.pyc │ ├── client.cpython-313.pyc │ ├── contact_extractor.cpython-313.pyc │ ├── errors.cpython-313.pyc │ ├── fetcher.cpython-313.pyc │ ├── logging_config.cpython-313.pyc │ └── store.cpython-313.pyc ├── cli.py ├── client.py ├── config.py ├── contact_extractor.py ├── errors.py ├── fetcher.py ├── logging_config.py └── store.py └── tests ├── __init__.py ├── test_fetcher.py └── test_store.py /.env: -------------------------------------------------------------------------------- 1 | API_ID= 2 | API_HASH= -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | 10 | - repo: https://github.com/psf/black 11 | rev: 23.3.0 12 | hooks: 13 | - id: black 14 | args: [--line-length=88] 15 | 16 | - repo: https://github.com/pycqa/isort 17 | rev: 5.12.0 18 | hooks: 19 | - id: isort 20 | args: ["--profile", "black", "--filter-files"] 21 | 22 | - repo: https://github.com/pycqa/flake8 23 | rev: 6.0.0 24 | hooks: 25 | - id: flake8 26 | additional_dependencies: [flake8-docstrings] 27 | args: ["--max-line-length=88", "--extend-ignore=E203"] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TeleGraphite: Telegram Channel Scraper & JSON Exporter 2 | 3 | 4 | A tool to fetch and 
save posts from public Telegram channels. 5 | ![TeleGraphite Screenshot](logo.png) 6 | 7 | ## Features 8 | 9 | - Fetch posts from multiple Telegram channels 10 | - Save posts as JSON files (with contact exports: emails, phone numbers, links) 11 | - Download and save media files (photos, documents, videos) 12 | - Deduplicate posts to avoid saving the same content twice 13 | - Run once or continuously with a specified interval 14 | - Filter posts by keywords or content type (text-only, media-only) 15 | - Schedule fetching on specific days and times 16 | 17 | ## Installation 18 | 19 | ### From Source 20 | 21 | ```bash 22 | # Clone the repository 23 | git clone https://github.com/hamodywe/telegraphite.git 24 | cd telegraphite 25 | 26 | # Install the package 27 | pip install -e . 28 | ``` 29 | 30 | ### Using pip 31 | 32 | ```bash 33 | pip install telegraphite 34 | ``` 35 | 36 | ## Setup 37 | 38 | 1. Create a Telegram API application: 39 | - Go to https://my.telegram.org/ 40 | - Log in with your phone number 41 | - Go to 'API development tools' 42 | - Create a new application 43 | - Note your API ID and API Hash 44 | 45 | 2. Create a `.env` file in your project directory with the following content: 46 | 47 | ``` 48 | API_ID=your_api_id 49 | API_HASH=your_api_hash 50 | ``` 51 | 52 | 3. 
Create a `channels.txt` file with one channel username per line: 53 | 54 | ``` 55 | @channel1 56 | @channel2 57 | channel3 58 | ``` 59 | 60 | ## Usage 61 | 62 | ### Command Line Interface 63 | 64 | TeleGraphite provides a command-line interface for fetching posts: 65 | 66 | ```bash 67 | # Fetch posts once and exit 68 | telegraphite once 69 | 70 | # Fetch posts continuously with a 1-hour interval 71 | telegraphite continuous --interval 3600 72 | ``` 73 | 74 | ### Options 75 | 76 | ``` 77 | -c, --channels-file Path to file containing channel usernames (default: channels.txt) 78 | -d, --data-dir Directory to store posts and media (default: data) 79 | -e, --env-file Path to .env file with API credentials (default: .env) 80 | -l, --limit Maximum number of posts to fetch per channel (default: 10) 81 | -v, --verbose Enable verbose logging 82 | -i, --interval Interval between fetches in seconds (default: 3600, only for continuous mode) 83 | --config Path to YAML configuration file 84 | 85 | # Filter options 86 | --keywords Filter posts containing specific keywords 87 | --media-only Only fetch posts containing media (photos, documents) 88 | --text-only Only fetch posts containing text 89 | 90 | # Schedule options 91 | --days Days of the week to run the fetcher (monday, tuesday, etc.) 
92 | --times Times of day to run the fetcher in HH:MM format 93 | ``` 94 | 95 | ### Configuration File 96 | 97 | You can also use a YAML configuration file to specify options: 98 | 99 | ```yaml 100 | # Directory to store posts and media 101 | data_dir: data 102 | 103 | # Path to file containing channel usernames 104 | channels_file: channels.txt 105 | 106 | # Maximum number of posts to fetch per channel 107 | limit: 10 108 | 109 | # Interval between fetches in seconds (for continuous mode) 110 | interval: 3600 111 | 112 | # Filters for posts 113 | filters: 114 | # Keywords to filter posts (only fetch posts containing these keywords) 115 | keywords: 116 | - important 117 | - announcement 118 | # Only fetch posts containing media (photos, documents) 119 | media_only: false 120 | # Only fetch posts containing text 121 | text_only: false 122 | 123 | # Schedule for fetching posts (for continuous mode) 124 | schedule: 125 | # Days of the week to run the fetcher 126 | days: 127 | - monday 128 | - wednesday 129 | - friday 130 | # Times of day to run the fetcher (HH:MM format) 131 | times: 132 | - "09:00" 133 | - "18:00" 134 | ``` 135 | 136 | To use a configuration file: 137 | 138 | ```bash 139 | telegraphite --config config.yaml once 140 | ``` 141 | 142 | Command-line arguments will override settings in the configuration file. 
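The precedence rule described above (YAML values act as defaults; any option the user actually passed on the command line wins) can be sketched as follows. This is an illustrative snippet, not TeleGraphite's internal code, and the `merge_config` helper is hypothetical; the plain dict stands in for the result of `yaml.safe_load` on the configuration file.

```python
# Illustrative sketch (hypothetical helper, not TeleGraphite's actual
# implementation) of config-file / command-line precedence: values parsed
# from the YAML file are defaults, and CLI options the user supplied win.

def merge_config(file_config: dict, cli_args: dict) -> dict:
    """Return effective settings: file values overridden by CLI values."""
    settings = dict(file_config)
    # Only options the user actually passed (value is not None) override.
    settings.update({k: v for k, v in cli_args.items() if v is not None})
    return settings

# `file_config` stands in for yaml.safe_load(open("config.yaml")).
file_config = {"limit": 10, "interval": 3600, "data_dir": "data"}
effective = merge_config(file_config, {"limit": 20, "data_dir": None})
print(effective)  # {'limit': 20, 'interval': 3600, 'data_dir': 'data'}
```

Here `--limit 20` on the command line replaces the file's `limit: 10`, while `data_dir` (not given on the CLI, so `None`) keeps its file value.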
143 | 144 | ### Examples 145 | 146 | ```bash 147 | # Fetch 20 posts from each channel and save to custom directory 148 | telegraphite once --limit 20 --data-dir custom_data 149 | 150 | # Use custom channels file and environment file 151 | telegraphite once --channels-file my_channels.txt --env-file my_env.env 152 | 153 | # Run continuously with 30-minute interval and verbose logging 154 | telegraphite continuous --interval 1800 --verbose 155 | 156 | # Fetch only posts containing specific keywords 157 | telegraphite once --keywords announcement important news 158 | 159 | # Fetch only posts containing media 160 | telegraphite once --media-only 161 | 162 | # Run continuously on specific days and times 163 | telegraphite continuous --days monday wednesday friday --times 09:00 18:00 164 | 165 | # Combine filters and scheduling 166 | telegraphite continuous --keywords important --media-only --days monday friday --times 12:00 167 | ``` 168 | 169 | ## Data Structure 170 | 171 | Posts and media are saved in the following structure: 172 | 173 | ``` 174 | data/ 175 | channel1/ 176 | posts.json 177 | media/ 178 | 20230101_123456_123.jpg 179 | 20230101_123456_124.pdf 180 | channel2/ 181 | posts.json 182 | media/ 183 | ... 184 | ``` 185 | 186 | Each `posts.json` file contains an array of post objects with the following structure: 187 | 188 | ```json 189 | [ 190 | { 191 | "channel": "channel1", 192 | "post_id": 123, 193 | "date": "2023-01-01T12:34:56Z", 194 | "text": "Post content", 195 | "images": ["media/20230101_123456_123.jpg"] 196 | }, 197 | ... 
198 | ] 199 | ``` 200 | 201 | ## License 202 | 203 | MIT 204 | -------------------------------------------------------------------------------- /channels.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hamodywe/telegram-scraper-TeleGraphite/6a12afa61afa841dc2f45fb3fcb2fab5a1630c81/channels.txt -------------------------------------------------------------------------------- /contact_patterns.txt: -------------------------------------------------------------------------------- 1 | # Email, Phone Number, and Link Patterns 2 | # Use this file to define email, phone number, and link patterns for extraction 3 | 4 | # Email patterns (must contain @ and be in English) 5 | [email_patterns] 6 | .*@.* 7 | 8 | # International phone number patterns 9 | [phone_patterns] 10 | # International format with + prefix 11 | \+[1-9][0-9]{1,14} 12 | # International format with 00 prefix 13 | 00[1-9][0-9]{1,14} 14 | 15 | ^07[0-9]{8}$ 16 | 07[0-9]{8}\b 17 | # North American format 18 | [1]?[\s-]?\(?[0-9]{3}\)?[\s-]?[0-9]{3}[\s-]?[0-9]{4} 19 | # European format 20 | [0-9]{2,4}[\s-]?[0-9]{2,4}[\s-]?[0-9]{2,4}[\s-]?[0-9]{2,4} 21 | # Generic local format - minimum 8 digits to avoid short numbers 22 | [0-9]{8,15} 23 | 24 | # Link patterns 25 | [link_patterns] 26 | # HTTP/HTTPS URLs 27 | https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[\w\-\._~:/?#[\]@!$&'()*+,;=]* 28 | # URLs with www but no protocol 29 | www\.(?:[-\w.]|(?:%[\da-fA-F]{2}))+[\w\-\._~:/?#[\]@!$&'()*+,;=]* 30 | # Common URL shorteners without protocol 31 | (?:bit\.ly|t\.co|goo\.gl|tinyurl\.com)/[\w\-\._~:/?#[\]@!$&'()*+,;=]* -------------------------------------------------------------------------------- /example.env: -------------------------------------------------------------------------------- 1 | # Telegram API credentials 2 | # Get these from https://my.telegram.org/ 3 | API_ID=your_api_id_here 4 | API_HASH=your_api_hash_here 
-------------------------------------------------------------------------------- /example_config.yaml: -------------------------------------------------------------------------------- 1 | # TeleGraphite Configuration 2 | 3 | # Directory to store posts and media 4 | data_dir: data 5 | 6 | # Path to file containing channel usernames 7 | channels_file: channels.txt 8 | 9 | # Maximum number of posts to fetch per channel 10 | limit: 10 11 | 12 | # Interval between fetches in seconds (for continuous mode) 13 | interval: 3600 14 | 15 | # Filters for posts 16 | filters: 17 | # Keywords to filter posts (only fetch posts containing these keywords) 18 | keywords: 19 | - important 20 | - announcement 21 | # Only fetch posts containing media (photos, documents) 22 | media_only: false 23 | # Only fetch posts containing text 24 | text_only: false 25 | 26 | # Schedule for fetching posts (for continuous mode) 27 | schedule: 28 | # Days of the week to run the fetcher 29 | days: 30 | - monday 31 | - wednesday 32 | - friday 33 | # Times of day to run the fetcher (HH:MM format) 34 | times: 35 | - "09:00" 36 | - "18:00" -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hamodywe/telegram-scraper-TeleGraphite/6a12afa61afa841dc2f45fb3fcb2fab5a1630c81/logo.png -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = test_*.py 3 | python_classes = Test* 4 | python_functions = test_* 5 | testpaths = tests 6 | addopts = -v -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | telethon>=1.24.0 3 | python-dotenv>=0.19.0 4 | pyyaml>=6.0 5 | 6 
| # Test dependencies 7 | pytest>=7.0.0 8 | pytest-asyncio>=0.20.0 9 | pytest-cov>=4.0.0 10 | 11 | # Development dependencies 12 | black>=23.3.0 13 | isort>=5.12.0 14 | flake8>=6.0.0 15 | flake8-docstrings>=1.7.0 16 | pre-commit>=3.3.0 -------------------------------------------------------------------------------- /scripts/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Run script for TeleGraphite. 4 | 5 | This script provides a simple way to run the TeleGraphite tool. 6 | It can be used to fetch posts from Telegram channels once or continuously. 7 | """ 8 | 9 | import argparse 10 | import asyncio 11 | import logging 12 | import os 13 | import sys 14 | from pathlib import Path 15 | 16 | # Add parent directory to path to allow importing telegraphite 17 | sys.path.insert(0, str(Path(__file__).parent.parent)) 18 | 19 | from telegraphite.cli import run_once, run_continuous, parse_args 20 | from telegraphite.logging_config import configure_logging 21 | 22 | 23 | def main(): 24 | """Run the TeleGraphite tool.""" 25 | # Parse command-line arguments 26 | parser = argparse.ArgumentParser(description="Fetch posts from Telegram channels") 27 | parser.add_argument( 28 | "mode", 29 | choices=["once", "continuous"], 30 | help="Run once or continuously with a specified interval", 31 | ) 32 | parser.add_argument( 33 | "-c", "--channels-file", default="channels.txt", help="Path to file containing channel usernames" 34 | ) 35 | parser.add_argument( 36 | "-d", "--data-dir", default="data", help="Directory to store posts and media" 37 | ) 38 | parser.add_argument( 39 | "-e", "--env-file", default=".env", help="Path to .env file with API credentials" 40 | ) 41 | parser.add_argument( 42 | "-l", "--limit", type=int, default=10, help="Maximum number of posts to fetch per channel" 43 | ) 44 | parser.add_argument( 45 | "-v", "--verbose", action="store_true", help="Enable verbose logging" 46 | ) 47 | 
parser.add_argument( 48 | "-i", "--interval", type=int, default=3600, help="Interval between fetches in seconds (only for continuous mode)" 49 | ) 50 | parser.add_argument( 51 | "--log-file", help="Path to log file" 52 | ) 53 | 54 | args = parser.parse_args() 55 | 56 | # Configure logging 57 | configure_logging(verbose=args.verbose, log_file=args.log_file) 58 | logger = logging.getLogger(__name__) 59 | 60 | try: 61 | if args.mode == "once": 62 | asyncio.run(run_once(args)) 63 | elif args.mode == "continuous": 64 | asyncio.run(run_continuous(args)) 65 | except KeyboardInterrupt: 66 | logger.info("Interrupted by user") 67 | except Exception as e: 68 | logger.error(f"Error: {e}") 69 | sys.exit(1) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203 4 | exclude = .git,__pycache__,build,dist 5 | 6 | [isort] 7 | profile = black 8 | multi_line_output = 3 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # Define test requirements 4 | test_requirements = [ 5 | "pytest>=7.0.0", 6 | "pytest-asyncio>=0.20.0", 7 | "pytest-cov>=4.0.0", 8 | ] 9 | 10 | # Define development requirements 11 | dev_requirements = [ 12 | "black>=23.3.0", 13 | "isort>=5.12.0", 14 | "flake8>=6.0.0", 15 | "flake8-docstrings>=1.7.0", 16 | "pre-commit>=3.3.0", 17 | ] + test_requirements 18 | 19 | setup( 20 | name="telegraphite", 21 | version="0.1.0", 22 | packages=find_packages(), 23 | install_requires=[ 24 | "telethon>=1.24.0", 25 | "python-dotenv>=0.19.0", 26 | "pyyaml>=6.0", 27 | ], 28 | extras_require={ 29 | "test": test_requirements, 30 | "dev": dev_requirements, 31 | }, 32 | entry_points={ 
33 | "console_scripts": [ 34 | "telegraphite=telegraphite.cli:main", 35 | ], 36 | }, 37 | author="TeleGraphite Developer", 38 | author_email="example@example.com", 39 | description="A tool to fetch and save posts from public Telegram channels", 40 | keywords="telegram, telethon, scraper", 41 | python_requires=">=3.6", 42 | ) -------------------------------------------------------------------------------- /telegraphite/__init__.py: -------------------------------------------------------------------------------- 1 | """TeleGraphite - A tool to fetch and save posts from public Telegram channels.""" 2 | 3 | __version__ = "0.1.0" -------------------------------------------------------------------------------- /telegraphite/cli.py: -------------------------------------------------------------------------------- 1 | """Command-line interface for TeleGraphite. 
2 | 3 | This module provides a command-line interface for fetching and saving posts from Telegram channels. 4 | It handles command-line arguments, configuration, and execution of the fetcher. 5 | """ 6 | 7 | import argparse 8 | import asyncio 9 | import logging 10 | import os 11 | import sys 12 | import traceback 13 | from datetime import datetime 14 | from pathlib import Path 15 | from telegraphite.client import TelegramClientManager 16 | from telegraphite.errors import AuthenticationError, ConfigurationError, FetchError 17 | from telegraphite.fetcher import ChannelFetcher 18 | from telegraphite.logging_config import configure_logging 19 | from telegraphite.store import PostStore 20 | 21 | 22 | def setup_logging(verbose: bool = False, log_file: str = None): 23 | """Set up logging configuration. 24 | 25 | Args: 26 | verbose: Whether to enable verbose logging. 27 | log_file: Optional path to a log file. 28 | """ 29 | configure_logging(verbose=verbose, log_file=log_file) 30 | 31 | 32 | async def run_once(args): 33 | """Run the fetcher once. 34 | 35 | Args: 36 | args: Command-line arguments. 37 | 38 | Raises: 39 | AuthenticationError: If there is an error with Telegram authentication. 40 | FetchError: If there is an error fetching posts. 
41 | """ 42 | logger = logging.getLogger(__name__) 43 | try: 44 | async with TelegramClientManager(args.env_file) as client: 45 | store = PostStore(args.data_dir) 46 | # Prepare filter options 47 | filters = { 48 | "keywords": args.keywords or [], 49 | "media_only": args.media_only, 50 | "text_only": args.text_only, 51 | } 52 | 53 | # Prepare schedule options 54 | schedule = { 55 | "days": args.days or [], 56 | "times": args.times or [], 57 | } 58 | 59 | fetcher = ChannelFetcher( 60 | client=client, 61 | store=store, 62 | channels_file=args.channels_file, 63 | limit=args.limit, 64 | filters=filters, 65 | schedule=schedule, 66 | contact_patterns_file=args.contact_patterns_file, 67 | ) 68 | posts = await fetcher.fetch_all_channels() 69 | logger.info(f"Fetched {len(posts)} posts from channels") 70 | 71 | # Save posts 72 | success = await fetcher.fetch_and_save() 73 | if success: 74 | logger.info("Successfully saved posts and media") 75 | else: 76 | logger.error("Failed to save some posts or media") 77 | 78 | except AuthenticationError as e: 79 | logger.error(f"Authentication error: {e}") 80 | raise 81 | except FetchError as e: 82 | logger.error(f"Error fetching posts: {e}") 83 | raise 84 | except Exception as e: 85 | logger.error(f"Unexpected error: {e}") 86 | logger.debug(traceback.format_exc()) 87 | raise FetchError(f"Failed to fetch posts: {e}") 88 | 89 | 90 | async def run_continuous(args): 91 | """Run the fetcher continuously with a specified interval. 92 | 93 | Args: 94 | args: Command-line arguments. 95 | 96 | Raises: 97 | KeyboardInterrupt: If the user interrupts the process. 
98 | """ 99 | logger = logging.getLogger(__name__) 100 | logger.info(f"Running continuously with {args.interval} seconds interval") 101 | 102 | while True: 103 | try: 104 | # Check if we should run based on schedule 105 | should_run = True 106 | 107 | # Check day of week if specified 108 | if args.days: 109 | current_day = datetime.now().strftime("%A").lower() 110 | if current_day not in args.days: 111 | should_run = False 112 | logger.info(f"Skipping run on {current_day} (not in schedule)") 113 | 114 | # Check time of day if specified 115 | if args.times and should_run: 116 | current_time = datetime.now().strftime("%H:%M") 117 | # Check if current time is close to any scheduled time (within 5 minutes) 118 | time_match = False 119 | for scheduled_time in args.times: 120 | scheduled_hour, scheduled_minute = map(int, scheduled_time.split(':')) 121 | current_hour, current_minute = map(int, current_time.split(':')) 122 | 123 | # Calculate difference in minutes 124 | scheduled_minutes = scheduled_hour * 60 + scheduled_minute 125 | current_minutes = current_hour * 60 + current_minute 126 | diff_minutes = abs(scheduled_minutes - current_minutes) 127 | 128 | if diff_minutes <= 5: # Within 5 minutes of scheduled time 129 | time_match = True 130 | break 131 | 132 | if not time_match: 133 | should_run = False 134 | logger.info(f"Skipping run at {current_time} (not in schedule)") 135 | 136 | if should_run: 137 | await run_once(args) 138 | 139 | logger.info(f"Sleeping for {args.interval} seconds...") 140 | await asyncio.sleep(args.interval) 141 | except KeyboardInterrupt: 142 | logger.info("Interrupted by user") 143 | break 144 | except AuthenticationError as e: 145 | logger.error(f"Authentication error: {e}") 146 | logger.info("Waiting 60 seconds before retrying...") 147 | await asyncio.sleep(60) # Wait longer for auth errors 148 | except FetchError as e: 149 | logger.error(f"Error fetching posts: {e}") 150 | logger.info("Waiting 30 seconds before retrying...") 151 | await 
asyncio.sleep(30) 152 | except Exception as e: 153 | logger.error(f"Unexpected error: {e}") 154 | logger.debug(traceback.format_exc()) 155 | logger.info("Waiting 10 seconds before retrying...") 156 | await asyncio.sleep(10) 157 | 158 | 159 | def parse_args(): 160 | """Parse command-line arguments. 161 | 162 | Returns: 163 | Parsed arguments. 164 | """ 165 | parser = argparse.ArgumentParser( 166 | description="Fetch and save posts from Telegram channels" 167 | ) 168 | 169 | parser.add_argument( 170 | "-c", 171 | "--channels-file", 172 | default="channels.txt", 173 | help="Path to file containing channel usernames (default: channels.txt)", 174 | ) 175 | parser.add_argument( 176 | "-d", 177 | "--data-dir", 178 | default="data", 179 | help="Directory to store posts and media (default: data)", 180 | ) 181 | parser.add_argument( 182 | "-e", 183 | "--env-file", 184 | default=".env", 185 | help="Path to .env file with API credentials (default: .env)", 186 | ) 187 | parser.add_argument( 188 | "-l", 189 | "--limit", 190 | type=int, 191 | default=10, 192 | help="Maximum number of posts to fetch per channel (default: 10)", 193 | ) 194 | parser.add_argument( 195 | "-v", "--verbose", action="store_true", help="Enable verbose logging" 196 | ) 197 | parser.add_argument( 198 | "--log-file", 199 | help="Path to log file (logs will be written to this file in addition to console)", 200 | ) 201 | parser.add_argument( 202 | "--contact-patterns-file", 203 | default="contact_patterns.txt", 204 | help="Path to file containing email and phone patterns (default: contact_patterns.txt)", 205 | ) 206 | parser.add_argument( 207 | "--config", 208 | help="Path to YAML configuration file", 209 | ) 210 | 211 | # Filter options 212 | filter_group = parser.add_argument_group("filter options") 213 | filter_group.add_argument( 214 | "--keywords", 215 | nargs="+", 216 | help="Filter posts containing specific keywords", 217 | ) 218 | filter_group.add_argument( 219 | "--media-only", 220 | 
action="store_true", 221 | help="Only fetch posts containing media (photos, documents)", 222 | ) 223 | filter_group.add_argument( 224 | "--text-only", 225 | action="store_true", 226 | help="Only fetch posts containing text", 227 | ) 228 | 229 | # Schedule options 230 | schedule_group = parser.add_argument_group("schedule options") 231 | schedule_group.add_argument( 232 | "--days", 233 | nargs="+", 234 | choices=["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"], 235 | help="Days of the week to run the fetcher (for continuous mode)", 236 | ) 237 | schedule_group.add_argument( 238 | "--times", 239 | nargs="+", 240 | help="Times of day to run the fetcher in HH:MM format (for continuous mode)", 241 | ) 242 | 243 | subparsers = parser.add_subparsers(dest="command", help="Command to run") 244 | 245 | # Once command 246 | once_parser = subparsers.add_parser( 247 | "once", help="Fetch posts once and exit" 248 | ) 249 | 250 | # Continuous command 251 | continuous_parser = subparsers.add_parser( 252 | "continuous", help="Fetch posts continuously" 253 | ) 254 | continuous_parser.add_argument( 255 | "-i", 256 | "--interval", 257 | type=int, 258 | default=3600, 259 | help="Interval between fetches in seconds (default: 3600)", 260 | ) 261 | 262 | args = parser.parse_args() 263 | 264 | # Default to 'once' if no command is specified 265 | if not args.command: 266 | args.command = "once" 267 | 268 | return args 269 | 270 | 271 | def main(): 272 | """Main entry point for the command-line interface. 273 | 274 | Returns: 275 | Exit code (0 for success, 1 for error). 
276 | """ 277 | try: 278 | args = parse_args() 279 | setup_logging(args.verbose, args.log_file) 280 | logger = logging.getLogger(__name__) 281 | 282 | # Create data directory if it doesn't exist 283 | Path(args.data_dir).mkdir(exist_ok=True, parents=True) 284 | 285 | # Check if channels file exists 286 | if not os.path.exists(args.channels_file): 287 | logger.error(f"Channels file not found: {args.channels_file}") 288 | logger.info("Create a text file with one channel username per line.") 289 | return 1 290 | 291 | # Check if .env file exists 292 | if not os.path.exists(args.env_file): 293 | logger.error(f".env file not found: {args.env_file}") 294 | logger.info("Create a .env file with API_ID and API_HASH from https://my.telegram.org/") 295 | return 1 296 | 297 | if args.command == "once": 298 | try: 299 | asyncio.run(run_once(args)) 300 | return 0 301 | except (AuthenticationError, FetchError) as e: 302 | logger.error(str(e)) 303 | return 1 304 | except Exception as e: 305 | logger.error(f"Unexpected error: {e}") 306 | logger.debug(traceback.format_exc()) 307 | return 1 308 | elif args.command == "continuous": 309 | try: 310 | asyncio.run(run_continuous(args)) 311 | return 0 312 | except KeyboardInterrupt: 313 | logger.info("Interrupted by user") 314 | return 0 315 | except Exception as e: 316 | logger.error(f"Unexpected error: {e}") 317 | logger.debug(traceback.format_exc()) 318 | return 1 319 | else: 320 | logger.error("No command specified. Use 'once' or 'continuous'.") 321 | return 1 322 | except Exception as e: 323 | print(f"Error: {e}") 324 | return 1 325 | 326 | 327 | if __name__ == "__main__": 328 | sys.exit(main()) -------------------------------------------------------------------------------- /telegraphite/client.py: -------------------------------------------------------------------------------- 1 | """Client module for TeleGraphite. 
2 | 3 | This module handles authentication and connection to Telegram using Telethon. 4 | It provides a context manager for managing the Telegram client session. 5 | """ 6 | 7 | import logging 8 | import os 9 | from pathlib import Path 10 | from typing import Optional 11 | 12 | from dotenv import load_dotenv 13 | from telethon import TelegramClient 14 | from telethon.errors import ApiIdInvalidError, AuthKeyError 15 | 16 | from telegraphite.errors import AuthenticationError 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class TelegramClientManager: 22 | """Manages the Telegram client connection and authentication.""" 23 | 24 | def __init__(self, env_path: Optional[str] = None): 25 | """Initialize the Telegram client manager. 26 | 27 | Args: 28 | env_path: Path to the .env file. If None, looks in the current directory. 29 | 30 | Raises: 31 | AuthenticationError: If API credentials are missing or invalid. 32 | """ 33 | # Load environment variables from .env file 34 | env_path = env_path or Path(".env") 35 | load_dotenv(env_path) 36 | 37 | # Get API credentials 38 | self.api_id = os.getenv("API_ID") 39 | self.api_hash = os.getenv("API_HASH") 40 | 41 | if not self.api_id or not self.api_hash: 42 | logger.error("API_ID and API_HASH must be set in the .env file") 43 | raise AuthenticationError( 44 | "API_ID and API_HASH must be set in the .env file. " 45 | "Get them from https://my.telegram.org/" 46 | ) 47 | 48 | logger.debug(f"Initialized TelegramClientManager with env file: {env_path}") 49 | self.client = None 50 | 51 | async def start(self): 52 | """Start the Telegram client session. 53 | 54 | Returns: 55 | The Telegram client instance. 56 | 57 | Raises: 58 | AuthenticationError: If there is an error with Telegram authentication. 
59 | """ 60 | try: 61 | logger.info("Starting Telegram client session") 62 | self.client = TelegramClient("telegraphite_session", self.api_id, self.api_hash) 63 | await self.client.start() 64 | logger.info("Telegram client session started successfully") 65 | return self.client 66 | except ApiIdInvalidError as e: 67 | logger.error(f"Invalid API credentials: {e}") 68 | raise AuthenticationError(f"Invalid API credentials: {e}") from e 69 | except AuthKeyError as e: 70 | logger.error(f"Authentication key error: {e}") 71 | raise AuthenticationError(f"Authentication key error: {e}") from e 72 | except Exception as e: 73 | logger.error(f"Error starting Telegram client: {e}") 74 | raise AuthenticationError(f"Failed to start Telegram client: {e}") from e 75 | 76 | async def stop(self): 77 | """Stop the Telegram client session.""" 78 | if self.client: 79 | await self.client.disconnect() 80 | self.client = None 81 | 82 | async def __aenter__(self): 83 | """Context manager entry point.""" 84 | return await self.start() 85 | 86 | async def __aexit__(self, exc_type, exc_val, exc_tb): 87 | """Context manager exit point.""" 88 | await self.stop() -------------------------------------------------------------------------------- /telegraphite/config.py: -------------------------------------------------------------------------------- 1 | """Configuration module for TeleGraphite. 2 | 3 | This module handles loading and validating configuration settings. 
4 | """ 5 | 6 | import logging 7 | import os 8 | from pathlib import Path 9 | from typing import Dict, Optional, Union 10 | 11 | import yaml 12 | from dotenv import load_dotenv 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class Config: 18 | """Configuration manager for TeleGraphite.""" 19 | 20 | def __init__( 21 | self, 22 | config_file: Optional[str] = None, 23 | env_file: Optional[str] = None, 24 | data_dir: Optional[str] = None, 25 | channels_file: Optional[str] = None, 26 | ): 27 | """Initialize the configuration manager. 28 | 29 | Args: 30 | config_file: Path to the YAML configuration file. 31 | env_file: Path to the .env file with API credentials. 32 | data_dir: Directory to store posts and media. 33 | channels_file: Path to the file containing channel usernames. 34 | """ 35 | self.config_file = config_file 36 | self.env_file = env_file or ".env" 37 | self.config = {} 38 | 39 | # Load configuration from file if provided 40 | if config_file and os.path.exists(config_file): 41 | self._load_config_file() 42 | 43 | # Load environment variables 44 | load_dotenv(self.env_file) 45 | 46 | # Override with provided values 47 | if data_dir: 48 | self.config["data_dir"] = data_dir 49 | if channels_file: 50 | self.config["channels_file"] = channels_file 51 | 52 | # Set defaults if not provided 53 | self.config.setdefault("data_dir", "data") 54 | self.config.setdefault("channels_file", "channels.txt") 55 | self.config.setdefault("limit", 10) 56 | self.config.setdefault("interval", 3600) 57 | 58 | # Filters defaults 59 | self.config.setdefault("filters", {}) 60 | self.config.get("filters").setdefault("keywords", []) 61 | self.config.get("filters").setdefault("media_only", False) 62 | self.config.get("filters").setdefault("text_only", False) 63 | 64 | # Schedule defaults 65 | self.config.setdefault("schedule", {}) 66 | self.config.get("schedule").setdefault("days", []) 67 | self.config.get("schedule").setdefault("times", []) 68 | 69 | # Validate 
configuration 70 | self._validate_config() 71 | 72 | def _load_config_file(self) -> None: 73 | """Load configuration from YAML file.""" 74 | try: 75 | with open(self.config_file, "r", encoding="utf-8") as f: 76 | file_config = yaml.safe_load(f) 77 | if file_config and isinstance(file_config, dict): 78 | self.config.update(file_config) 79 | logger.info(f"Loaded configuration from {self.config_file}") 80 | else: 81 | logger.warning(f"Invalid configuration in {self.config_file}") 82 | except Exception as e: 83 | logger.error(f"Error loading configuration from {self.config_file}: {e}") 84 | 85 | def _validate_config(self) -> None: 86 | """Validate the configuration.""" 87 | # Check API credentials 88 | api_id = os.getenv("API_ID") 89 | api_hash = os.getenv("API_HASH") 90 | 91 | if not api_id or not api_hash: 92 | logger.warning( 93 | "API_ID and API_HASH not found in environment variables. " 94 | "These are required for connecting to Telegram." 95 | ) 96 | 97 | # Check channels file 98 | channels_file = self.get("channels_file") 99 | if not os.path.exists(channels_file): 100 | logger.warning( 101 | f"Channels file not found: {channels_file}. " 102 | "Create a text file with one channel username per line." 103 | ) 104 | 105 | # Create data directory if it doesn't exist 106 | data_dir = Path(self.get("data_dir")) 107 | data_dir.mkdir(exist_ok=True, parents=True) 108 | 109 | def get(self, key: str, default: Optional[Union[str, int, bool]] = None) -> Union[str, int, bool]: 110 | """Get a configuration value. 111 | 112 | Args: 113 | key: The configuration key. 114 | default: Default value if key is not found. 115 | 116 | Returns: 117 | The configuration value. 118 | """ 119 | return self.config.get(key, default) 120 | 121 | def set(self, key: str, value: Union[str, int, bool]) -> None: 122 | """Set a configuration value. 123 | 124 | Args: 125 | key: The configuration key. 126 | value: The configuration value. 
127 | """ 128 | self.config[key] = value 129 | 130 | def as_dict(self) -> Dict[str, Union[str, int, bool]]: 131 | """Get the configuration as a dictionary. 132 | 133 | Returns: 134 | The configuration dictionary. 135 | """ 136 | return self.config.copy() -------------------------------------------------------------------------------- /telegraphite/contact_extractor.py: -------------------------------------------------------------------------------- 1 | """Contact Extractor module for TeleGraphite. 2 | 3 | This module provides functionality to extract contact information such as emails, phone numbers, and links from text. 4 | """ 5 | 6 | import re 7 | 8 | class ContactExtractor: 9 | def __init__(self, patterns_file: str = "contact_patterns.txt"): 10 | self.patterns_file = patterns_file 11 | self.email_pattern = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}") 12 | self.phone_pattern = re.compile(r"\+?\d{1,4}?[-.\s]?\(?\d{1,3}?\)?[-.\s]?\d{1,4}[-.\s]?\d{1,4}[-.\s]?\d{1,9}") 13 | # Basic URL pattern (http/https and t.me links) so the "links" field is actually populated 14 | self.link_pattern = re.compile(r"(?:https?://|t\.me/)\S+") 15 | 16 | def extract_contacts(self, text: str) -> dict: 17 | emails = self.email_pattern.findall(text) 18 | phones = self.phone_pattern.findall(text) 19 | links = self.link_pattern.findall(text) 20 | return {"emails": emails, "phones": phones, "links": links} -------------------------------------------------------------------------------- /telegraphite/errors.py: -------------------------------------------------------------------------------- 1 | """Error handling module for TeleGraphite. 2 | 3 | This module provides custom exceptions and error handling utilities.
4 | """ 5 | 6 | import logging 7 | from typing import Any, Callable, Optional, TypeVar 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | # Type variable for generic function return type 12 | T = TypeVar('T') 13 | 14 | 15 | class TeleGraphiteError(Exception): 16 | """Base exception class for TeleGraphite.""" 17 | pass 18 | 19 | 20 | class ConfigurationError(TeleGraphiteError): 21 | """Raised when there is an error in the configuration.""" 22 | pass 23 | 24 | 25 | class AuthenticationError(TeleGraphiteError): 26 | """Raised when there is an error with Telegram authentication.""" 27 | pass 28 | 29 | 30 | class FetchError(TeleGraphiteError): 31 | """Raised when there is an error fetching posts from Telegram.""" 32 | pass 33 | 34 | 35 | class StorageError(TeleGraphiteError): 36 | """Raised when there is an error storing posts or media.""" 37 | pass 38 | 39 | 40 | def handle_errors(default_return: Optional[Any] = None) -> Callable: 41 | """Decorator to handle exceptions in functions. 42 | 43 | Args: 44 | default_return: Value to return if an exception occurs. 45 | 46 | Returns: 47 | Decorated function that handles exceptions. 48 | """ 49 | def decorator(func: Callable[..., T]) -> Callable[..., Optional[T]]: 50 | def wrapper(*args, **kwargs) -> Optional[T]: 51 | try: 52 | return func(*args, **kwargs) 53 | except Exception as e: 54 | logger.error(f"Error in {func.__name__}: {str(e)}") 55 | return default_return 56 | return wrapper 57 | return decorator 58 | 59 | 60 | async def handle_async_errors(func: Callable, *args, **kwargs) -> Any: 61 | """Handle exceptions in async functions. 62 | 63 | Args: 64 | func: Async function to execute. 65 | *args: Positional arguments to pass to the function. 66 | **kwargs: Keyword arguments to pass to the function. 67 | 68 | Returns: 69 | The result of the function if it completes without error. 70 | 71 | Raises: 72 | Exception: Re-raises any exception that occurs during execution.
73 | """ 74 | try: 75 | return await func(*args, **kwargs) 76 | except Exception as e: 77 | logger.error(f"Error in async function {func.__name__}: {str(e)}") 78 | raise -------------------------------------------------------------------------------- /telegraphite/fetcher.py: -------------------------------------------------------------------------------- 1 | """Fetcher module for TeleGraphite. 2 | 3 | This module handles fetching posts from Telegram channels. 4 | """ 5 | 6 | import asyncio 7 | import logging 8 | from datetime import datetime 9 | from pathlib import Path 10 | from typing import Dict, List, Optional, Set 11 | 12 | from telethon import TelegramClient 13 | from telethon.tl.types import Message, MessageMediaPhoto, MessageMediaDocument 14 | 15 | from telegraphite.store import PostStore 16 | from telegraphite.contact_extractor import ContactExtractor 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | class ChannelFetcher: 22 | """Fetches posts from Telegram channels.""" 23 | 24 | def __init__( 25 | self, 26 | client: TelegramClient, 27 | store: PostStore, 28 | channels_file: str = "channels.txt", 29 | limit: int = 10, 30 | filters: dict = None, 31 | schedule: dict = None, 32 | contact_patterns_file: str = "contact_patterns.txt", 33 | ): 34 | """Initialize the channel fetcher.""" 35 | self.client = client 36 | self.store = store 37 | self.channels_file = Path(channels_file) 38 | self.limit = limit 39 | self.existing_post_ids: Set[int] = set() 40 | self.contact_extractor = ContactExtractor(contact_patterns_file) 41 | 42 | # Initialize filters 43 | self.filters = filters or {} 44 | self.filters.setdefault("keywords", []) 45 | self.filters.setdefault("media_only", False) 46 | self.filters.setdefault("text_only", False) 47 | 48 | # Initialize schedule 49 | self.schedule = schedule or {} 50 | self.schedule.setdefault("days", []) 51 | self.schedule.setdefault("times", []) 52 | 53 | def _load_channels(self) -> List[str]: 54 | if not 
self.channels_file.exists(): 55 | logger.warning(f"Channels file not found: {self.channels_file}") 56 | return [] 57 | 58 | with open(self.channels_file, "r", encoding="utf-8") as f: 59 | channels = [line.strip() for line in f if line.strip()] 60 | 61 | logger.info(f"Loaded {len(channels)} channels from {self.channels_file}") 62 | return channels 63 | 64 | async def _fetch_channel_posts(self, channel: str) -> List[Dict]: 65 | logger.info(f"Fetching posts from channel: {channel}") 66 | posts = [] 67 | 68 | try: 69 | self.existing_post_ids = self.store.get_existing_post_ids(channel) 70 | 71 | async for message in self.client.iter_messages(channel, limit=self.limit): 72 | if not isinstance(message, Message): 73 | continue 74 | if message.id in self.existing_post_ids: 75 | logger.debug(f"Skipping already saved post: {message.id}") 76 | continue 77 | 78 | post = await self._process_message(channel, message) 79 | if post: 80 | posts.append(post) 81 | 82 | except Exception as e: 83 | logger.error(f"Error fetching posts from {channel}: {e}") 84 | 85 | return posts 86 | 87 | async def _process_message(self, channel: str, message: Message) -> Optional[Dict]: 88 | """Process a message and extract relevant information.""" 89 | try: 90 | date_iso = message.date.strftime("%Y-%m-%dT%H:%M:%SZ") 91 | text = message.text or message.message or "" 92 | 93 | # Filters 94 | if self.filters["media_only"] and not message.media: 95 | logger.debug(f"Skipping post {message.id} (no media)") 96 | return None 97 | 98 | if self.filters["text_only"] and not text.strip(): 99 | logger.debug(f"Skipping post {message.id} (no text)") 100 | return None 101 | 102 | if self.filters["keywords"]: 103 | has_keyword = any(keyword.lower() in text.lower() for keyword in self.filters["keywords"]) 104 | if not has_keyword: 105 | logger.debug(f"Skipping post {message.id} (no matching keywords)") 106 | return None 107 | 108 | # Extract contact information (emails and phone numbers) 109 | contacts = 
self.contact_extractor.extract_contacts(text) 110 | 111 | # Save media only if text_only is False 112 | media_info = [] 113 | if message.media and not self.filters["text_only"]: 114 | if isinstance(message.media, (MessageMediaPhoto, MessageMediaDocument)): 115 | media_info = await self.store.save_media(channel, message) 116 | 117 | post = { 118 | "channel_name": channel.lstrip("@"), 119 | "post_id": message.id, 120 | "timestamp": date_iso, 121 | "text": text, 122 | "media": media_info, 123 | "image_paths": [m.get("path") for m in media_info] if media_info else [], 124 | "source_channel": channel, 125 | "post_type": "media" if media_info else "text", 126 | "fetch_date": datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ"), 127 | "has_media": bool(media_info), 128 | "media_count": len(media_info), 129 | "emails": contacts["emails"], 130 | "phones": contacts["phones"], 131 | "links": contacts["links"], 132 | } 133 | 134 | content_parts = [] 135 | if text: 136 | content_parts.append(text.strip()) 137 | for media in media_info: 138 | if media.get("path"): 139 | content_parts.append(str(media["path"]).strip()) 140 | 141 | if content_parts: 142 | import hashlib 143 | content_string = "|".join(content_parts) 144 | post["content_hash"] = hashlib.md5(content_string.encode("utf-8")).hexdigest() 145 | 146 | return post 147 | 148 | except Exception as e: 149 | logger.error(f"Error processing message {message.id}: {e}") 150 | return None 151 | 152 | async def fetch_all_channels(self) -> List[Dict]: 153 | channels = self._load_channels() 154 | all_posts = [] 155 | 156 | for channel in channels: 157 | channel_posts = await self._fetch_channel_posts(channel) 158 | all_posts.extend(channel_posts) 159 | if channel_posts: 160 | self.store.save_posts(channel_posts) 161 | 162 | return all_posts 163 | 164 | async def fetch_and_save(self) -> bool: 165 | channels = self._load_channels() 166 | all_posts = [] 167 | new_post_count = 0 168 | error_count = 0 169 | 170 | for channel in channels: 171 | 
try: 172 | channel_posts = await self._fetch_channel_posts(channel) 173 | if channel_posts: 174 | new_post_count += len(channel_posts) 175 | all_posts.extend(channel_posts) 176 | success = self.store.save_posts(channel_posts) 177 | if not success: 178 | logger.error(f"Failed to save posts for channel {channel}") 179 | error_count += 1 180 | else: 181 | logger.info(f"Successfully saved {len(channel_posts)} new posts from {channel}") 182 | except Exception as e: 183 | logger.error(f"Error processing channel {channel}: {e}") 184 | error_count += 1 185 | 186 | logger.info(f"Fetch and save complete. Added {new_post_count} new posts from {len(channels)} channels") 187 | if error_count > 0: 188 | logger.warning(f"Encountered errors with {error_count} channels") 189 | 190 | return error_count == 0 191 | 192 | async def run_periodic(self, interval_seconds: int): 193 | while True: 194 | logger.info(f"Starting periodic fetch (interval: {interval_seconds}s)") 195 | if self._should_run_now(): 196 | try: 197 | success = await self.fetch_and_save() 198 | if success: 199 | logger.info("Periodic fetch completed successfully") 200 | else: 201 | logger.warning("Periodic fetch completed with some errors") 202 | except Exception as e: 203 | logger.error(f"Error during periodic fetch: {e}") 204 | else: 205 | logger.info("Skipping fetch based on schedule configuration") 206 | 207 | logger.info(f"Waiting {interval_seconds} seconds until next run") 208 | await asyncio.sleep(interval_seconds) 209 | 210 | def _should_run_now(self) -> bool: 211 | if not self.schedule or (not self.schedule.get("days") and not self.schedule.get("times")): 212 | return True 213 | 214 | now = datetime.now() 215 | 216 | if self.schedule.get("days"): 217 | weekday = now.strftime("%A").lower() 218 | if weekday not in [day.lower() for day in self.schedule["days"]]: 219 | logger.debug(f"Not running: today ({weekday}) is not in scheduled days {self.schedule['days']}") 220 | return False 221 | 222 | if 
self.schedule.get("times"): 223 | current_time = now.strftime("%H:%M") 224 | for time_range in self.schedule["times"]: 225 | if isinstance(time_range, str) and current_time == time_range: 226 | return True 227 | elif isinstance(time_range, dict) and "start" in time_range and "end" in time_range: 228 | if time_range["start"] <= current_time <= time_range["end"]: 229 | return True 230 | 231 | logger.debug(f"Not running: current time ({current_time}) is not in scheduled times {self.schedule['times']}") 232 | return False 233 | 234 | return True 235 | -------------------------------------------------------------------------------- /telegraphite/logging_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logging configuration module for TeleGraphite. 3 | 4 | This module provides centralized logging configuration for the application. 5 | """ 6 | 7 | import logging 8 | from pathlib import Path 9 | from typing import Optional 10 | 11 | 12 | def configure_logging(verbose: bool = False, log_file: Optional[str] = None) -> logging.Logger: 13 | """ 14 | Configure logging for the application. 15 | 16 | Args: 17 | verbose (bool): Whether to enable verbose (DEBUG) logging. 18 | log_file (Optional[str]): Optional path to a log file. If provided, logs will be written to this file 19 | in addition to the console. 20 | 21 | Returns: 22 | logging.Logger: The configured root logger. 
23 | """ 24 | log_level = logging.DEBUG if verbose else logging.INFO 25 | log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 26 | detailed_log_format = "%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s" 27 | 28 | # Configure root logger 29 | root_logger = logging.getLogger() 30 | root_logger.setLevel(log_level) 31 | 32 | # Remove existing handlers to avoid duplicate logs 33 | for handler in root_logger.handlers[:]: 34 | root_logger.removeHandler(handler) 35 | 36 | # Create and add console handler 37 | console_handler = logging.StreamHandler() 38 | console_handler.setLevel(log_level) 39 | console_handler.setFormatter(logging.Formatter(log_format)) 40 | root_logger.addHandler(console_handler) 41 | 42 | # Create file handlers if log_file is provided 43 | if log_file: 44 | log_path = Path(log_file) 45 | # Create directory if it doesn't exist 46 | log_path.parent.mkdir(parents=True, exist_ok=True) 47 | 48 | # File handler for detailed log output 49 | file_handler = logging.FileHandler(log_path, mode='a', encoding='utf-8') 50 | file_handler.setLevel(log_level) 51 | file_handler.setFormatter(logging.Formatter(detailed_log_format)) 52 | root_logger.addHandler(file_handler) 53 | 54 | # Error-specific file handler for ERROR and above messages 55 | error_log_path = log_path.with_name(f"{log_path.stem}_errors{log_path.suffix}") 56 | error_handler = logging.FileHandler(error_log_path, mode='a', encoding='utf-8') 57 | error_handler.setLevel(logging.ERROR) 58 | error_handler.setFormatter(logging.Formatter(detailed_log_format)) 59 | root_logger.addHandler(error_handler) 60 | 61 | # Suppress verbose logging from third-party libraries 62 | logging.getLogger('telethon').setLevel(logging.INFO) 63 | logging.getLogger('asyncio').setLevel(logging.INFO) 64 | 65 | return root_logger 66 | 67 | 68 | def configure_post_logger(log_dir: Optional[str] = None) -> logging.Logger: 69 | """ 70 | Configure a specialized logger for tracking post fetching 
and media downloads. 71 | 72 | This creates a separate logger that tracks new posts, updates, and media downloads 73 | with channel-specific information. 74 | 75 | Args: 76 | log_dir (Optional[str]): Directory to store post logs. If None, logs will only be output to the console. 77 | 78 | Returns: 79 | logging.Logger: The configured post logger. 80 | """ 81 | post_logger = logging.getLogger('telegraphite.posts') 82 | post_logger.setLevel(logging.INFO) 83 | 84 | post_log_format = "%(asctime)s - %(levelname)s - [%(channel)s] - %(message)s" 85 | post_formatter = logging.Formatter(post_log_format) 86 | 87 | # Remove any existing handlers to avoid duplicates 88 | for handler in post_logger.handlers[:]: 89 | post_logger.removeHandler(handler) 90 | 91 | # Create console handler for the post logger 92 | console_handler = logging.StreamHandler() 93 | console_handler.setLevel(logging.INFO) 94 | console_handler.setFormatter(post_formatter) 95 | post_logger.addHandler(console_handler) 96 | 97 | # Create file handlers if log_dir is provided 98 | if log_dir: 99 | log_path = Path(log_dir) 100 | log_path.mkdir(parents=True, exist_ok=True) 101 | 102 | # Handler for logging all post-related messages 103 | posts_log_file = log_path / "posts.log" 104 | file_handler = logging.FileHandler(posts_log_file, mode='a', encoding='utf-8') 105 | file_handler.setLevel(logging.INFO) 106 | file_handler.setFormatter(post_formatter) 107 | post_logger.addHandler(file_handler) 108 | 109 | # Handler for media-specific log messages only 110 | media_log_file = log_path / "media.log" 111 | media_handler = logging.FileHandler(media_log_file, mode='a', encoding='utf-8') 112 | media_handler.setLevel(logging.INFO) 113 | media_handler.setFormatter(post_formatter) 114 | media_handler.addFilter(lambda record: 'media' in record.getMessage().lower()) 115 | post_logger.addHandler(media_handler) 116 | 117 | return post_logger 118 | -------------------------------------------------------------------------------- 
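The `%(channel)s` placeholder used by `configure_post_logger` is not a standard `LogRecord` attribute; it only resolves when callers pass `extra={"channel": ...}` on every logging call. A minimal, self-contained sketch of that pattern (the logger name, channel name, and `StringIO` capture here are illustrative, not part of TeleGraphite):

```python
import io
import logging

# Sketch of the pattern configure_post_logger() builds on: a Formatter with a
# custom %(channel)s placeholder, filled in per call through the `extra` mapping.
buffer = io.StringIO()  # capture output so the result is easy to inspect
handler = logging.StreamHandler(buffer)
handler.setFormatter(logging.Formatter("%(levelname)s - [%(channel)s] - %(message)s"))

post_logger = logging.getLogger("example.posts")
post_logger.setLevel(logging.INFO)
post_logger.addHandler(handler)

# Every call must supply extra={"channel": ...}; without it the %(channel)s
# placeholder has no value and the handler reports a formatting error instead.
post_logger.info("Saved 3 new posts", extra={"channel": "@example_channel"})

print(buffer.getvalue().strip())  # INFO - [@example_channel] - Saved 3 new posts
```

Keys passed via `extra` become attributes on the `LogRecord`, which is why the format string can reference them like built-in fields.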
/telegraphite/store.py: -------------------------------------------------------------------------------- 1 | """Store module for TeleGraphite. 2 | 3 | This module handles storing posts and media files from Telegram channels. 4 | """ 5 | 6 | import hashlib 7 | import json 8 | import logging 9 | import os 10 | from datetime import datetime 11 | from pathlib import Path 12 | from typing import Dict, List, Optional, Set, Any 13 | 14 | from telethon.tl.types import Message, MessageMediaPhoto, MessageMediaDocument 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | class PostStore: 20 | """Stores posts and media files from Telegram channels.""" 21 | 22 | def __init__(self, data_dir: str = "data"): 23 | """Initialize the post store. 24 | 25 | Args: 26 | data_dir: Directory to store posts and media files. 27 | """ 28 | self.data_dir = Path(data_dir) 29 | self.data_dir.mkdir(exist_ok=True, parents=True) 30 | 31 | def get_channel_dir(self, channel: str) -> Path: 32 | """Get the directory for a channel. 33 | 34 | Args: 35 | channel: The channel username. 36 | 37 | Returns: 38 | Path to the channel directory. 39 | """ 40 | # Remove @ if present and create directory 41 | channel_name = channel.lstrip("@") 42 | channel_dir = self.data_dir / channel_name 43 | channel_dir.mkdir(exist_ok=True) 44 | return channel_dir 45 | 46 | def get_existing_post_ids(self, channel: str) -> Set[int]: 47 | """Get IDs of existing posts for a channel. 48 | 49 | Args: 50 | channel: The channel username. 51 | 52 | Returns: 53 | Set of existing post IDs. 
54 | """ 55 | channel_dir = self.get_channel_dir(channel) 56 | posts_file = channel_dir / "posts.json" 57 | 58 | if not posts_file.exists(): 59 | return set() 60 | 61 | try: 62 | with open(posts_file, "r", encoding="utf-8") as f: 63 | posts = json.load(f) 64 | return {post.get("post_id") for post in posts if post.get("post_id")} 65 | except (json.JSONDecodeError, FileNotFoundError) as e: 66 | logger.error(f"Error loading existing posts for {channel}: {e}") 67 | return set() 68 | 69 | def _validate_post(self, post: Dict[str, Any]) -> bool: 70 | """Validate that a post has all required fields. 71 | 72 | Args: 73 | post: The post dictionary to validate. 74 | 75 | Returns: 76 | True if the post is valid, False otherwise. 77 | """ 78 | # Required fields for a valid post 79 | required_fields = ["post_id"] 80 | 81 | # Check required fields 82 | for field in required_fields: 83 | if field not in post or post[field] is None: 84 | logger.warning(f"Post missing required field: {field}") 85 | return False 86 | 87 | # Ensure post has a channel name 88 | if not (post.get("channel_name") or post.get("channel") or post.get("source_channel")): 89 | logger.warning("Post missing channel information") 90 | return False 91 | 92 | # If post doesn't have a timestamp, add current time 93 | if not post.get("timestamp"): 94 | post["timestamp"] = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ") 95 | 96 | # Calculate content hash if not present 97 | if not post.get("content_hash"): 98 | post["content_hash"] = self._calculate_content_hash(post) 99 | 100 | return True 101 | 102 | def _calculate_content_hash(self, post: Dict[str, Any]) -> str: 103 | """Calculate a hash of the post content for deduplication. 104 | 105 | Args: 106 | post: The post dictionary. 107 | 108 | Returns: 109 | A hash string representing the post content. 
110 | """ 111 | # Create a string with the most important content 112 | content_parts = [] 113 | 114 | # Add text content if available 115 | if post.get("text"): 116 | content_parts.append(str(post["text"]).strip()) 117 | 118 | # Add media paths if available 119 | if post.get("media"): 120 | for media in post["media"]: 121 | if media.get("path"): 122 | content_parts.append(str(media["path"]).strip()) 123 | elif post.get("image_paths"): 124 | for path in post["image_paths"]: 125 | content_parts.append(str(path).strip()) 126 | 127 | # If no content parts, use post_id 128 | if not content_parts and post.get("post_id"): 129 | content_parts.append(str(post["post_id"])) 130 | 131 | # Join all parts and create hash 132 | content_string = "|".join(content_parts) 133 | return hashlib.md5(content_string.encode("utf-8")).hexdigest() 134 | 135 | def save_posts(self, posts: List[Dict]) -> bool: 136 | """Save posts to JSON files. 137 | 138 | Args: 139 | posts: List of post dictionaries. 140 | 141 | Returns: 142 | True if successful, False otherwise. 
143 | """ 144 | if not posts: 145 | return True 146 | 147 | # Group posts by channel 148 | posts_by_channel: Dict[str, List[Dict]] = {} 149 | for post in posts: 150 | # Support both old and new post structure 151 | channel = post.get("source_channel") or post.get("channel") or post.get("channel_name") 152 | if not channel: 153 | logger.warning(f"Skipping post without channel information: {post.get('post_id')}") 154 | continue 155 | 156 | # Normalize channel name (remove @ prefix) 157 | channel = channel.lstrip("@") 158 | 159 | # Ensure post has all required fields 160 | if not self._validate_post(post): 161 | logger.warning(f"Skipping invalid post for channel {channel}: {post.get('post_id')}") 162 | continue 163 | 164 | if channel not in posts_by_channel: 165 | posts_by_channel[channel] = [] 166 | posts_by_channel[channel].append(post) 167 | 168 | # Save posts for each channel 169 | success = True 170 | for channel, channel_posts in posts_by_channel.items(): 171 | channel_dir = self.get_channel_dir(channel) 172 | posts_file = channel_dir / "posts.json" 173 | 174 | # Load existing posts if any 175 | existing_posts = [] 176 | if posts_file.exists(): 177 | try: 178 | with open(posts_file, "r", encoding="utf-8") as f: 179 | existing_posts = json.load(f) 180 | except (json.JSONDecodeError, FileNotFoundError) as e: 181 | logger.error(f"Error loading existing posts for {channel}: {e}") 182 | logger.info(f"Creating new posts file for {channel}") 183 | 184 | # Enhanced deduplication: check by post_id and content hash 185 | existing_post_ids = {post.get("post_id") for post in existing_posts if post.get("post_id")} 186 | existing_content_hashes = {post.get("content_hash") for post in existing_posts if post.get("content_hash")} 187 | 188 | # Track how many new posts were added 189 | new_posts_count = 0 190 | updated_posts_count = 0 191 | 192 | for post in channel_posts: 193 | post_id = post.get("post_id") 194 | content_hash = post.get("content_hash") 195 | 196 | # Check if 
this is a new post or an update to an existing post 197 | is_new_post = post_id and post_id not in existing_post_ids 198 | is_duplicate_content = content_hash and content_hash in existing_content_hashes 199 | 200 | if is_new_post: 201 | # Add new post 202 | existing_posts.append(post) 203 | existing_post_ids.add(post_id) 204 | if content_hash: 205 | existing_content_hashes.add(content_hash) 206 | new_posts_count += 1 207 | logger.debug(f"Added new post {post_id} for channel {channel}") 208 | elif not is_duplicate_content and post_id in existing_post_ids: 209 | # Update existing post (content changed) 210 | for i, existing_post in enumerate(existing_posts): 211 | if existing_post.get("post_id") == post_id: 212 | existing_posts[i] = post 213 | updated_posts_count += 1 214 | logger.debug(f"Updated existing post {post_id} for channel {channel}") 215 | break 216 | 217 | # Save all posts 218 | try: 219 | with open(posts_file, "w", encoding="utf-8") as f: 220 | json.dump(existing_posts, f, ensure_ascii=False, indent=2) 221 | logger.info(f"Saved {new_posts_count} new posts and updated {updated_posts_count} posts for channel {channel} (total: {len(existing_posts)})") 222 | except Exception as e: 223 | logger.error(f"Error saving posts for {channel}: {e}") 224 | success = False 225 | 226 | return success 227 | 228 | async def save_media(self, channel: str, message: Message) -> List[Dict]: 229 | """Save media files from a message. 230 | 231 | Args: 232 | channel: The channel username. 233 | message: The Telegram message. 234 | 235 | Returns: 236 | List of dictionaries containing media information including path and metadata. 
237 | """ 238 | if not message.media: 239 | return [] 240 | 241 | # Normalize channel name (remove @ prefix) 242 | channel_name = channel.lstrip("@") 243 | channel_dir = self.get_channel_dir(channel_name) 244 | 245 | # Create year/month based directory structure for better organization 246 | date = message.date 247 | year_month_dir = f"{date.year:04d}/{date.month:02d}" 248 | media_dir = channel_dir / "media" / year_month_dir 249 | media_dir.mkdir(exist_ok=True, parents=True) 250 | 251 | saved_media = [] 252 | try: 253 | # Generate a filename based on message ID and date 254 | date_str = date.strftime("%Y%m%d_%H%M%S") 255 | filename_base = f"{date_str}_{message.id}" 256 | 257 | # Download the media file 258 | if isinstance(message.media, (MessageMediaPhoto, MessageMediaDocument)): 259 | file_path = media_dir / f"{filename_base}" 260 | downloaded_path = await message.download_media(file=str(file_path)) 261 | 262 | if downloaded_path: 263 | # Convert to relative path for storage 264 | rel_path = os.path.relpath(downloaded_path, start=str(channel_dir)) 265 | abs_path = os.path.abspath(downloaded_path) 266 | 267 | # Add media metadata to make it easier for external applications 268 | media_type = "photo" if isinstance(message.media, MessageMediaPhoto) else "document" 269 | file_ext = os.path.splitext(downloaded_path)[1].lstrip('.') 270 | file_size = os.path.getsize(downloaded_path) if os.path.exists(downloaded_path) else 0 271 | 272 | # Create a detailed media info dictionary 273 | media_info = { 274 | "path": rel_path, 275 | "absolute_path": abs_path, 276 | "type": media_type, 277 | "format": file_ext, 278 | "size": file_size, 279 | "filename": os.path.basename(downloaded_path), 280 | "timestamp": message.date.strftime("%Y-%m-%dT%H:%M:%SZ"), 281 | "year": date.year, 282 | "month": date.month, 283 | "day": date.day, 284 | "channel": channel_name, 285 | "message_id": message.id, 286 | "media_id": hashlib.md5(rel_path.encode("utf-8")).hexdigest()[:10] 287 | } 288 | 289 | 
saved_media.append(media_info) 290 | logger.info(f"Saved media from post {message.id} to {rel_path}") 291 | logger.debug(f"Media details: type={media_type}, size={file_size} bytes, format={file_ext}") 292 | 293 | except Exception as e: 294 | logger.error(f"Error saving media for message {message.id}: {e}") 295 | 296 | return saved_media -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Test package for TeleGraphite.""" -------------------------------------------------------------------------------- /tests/test_fetcher.py: -------------------------------------------------------------------------------- 1 | """Tests for the fetcher module. 2 | 3 | This module contains tests for the ChannelFetcher class. 4 | """ 5 | 6 | import asyncio 7 | import os 8 | import tempfile 9 | import unittest 10 | from pathlib import Path 11 | from unittest.mock import AsyncMock, MagicMock, patch 12 | 13 | from telegraphite.fetcher import ChannelFetcher 14 | from telegraphite.store import PostStore 15 | 16 | 17 | class TestChannelFetcher(unittest.IsolatedAsyncioTestCase): 18 | """Test cases for the ChannelFetcher class; IsolatedAsyncioTestCase ensures the async test methods are actually awaited.""" 19 | 20 | def setUp(self): 21 | """Set up test environment before each test.""" 22 | # Create a temporary directory for test data 23 | self.temp_dir = tempfile.mkdtemp() 24 | self.store = PostStore(data_dir=self.temp_dir) 25 | 26 | # Create a mock client 27 | self.mock_client = MagicMock() 28 | 29 | # Create a temporary channels file 30 | self.channels_file = Path(self.temp_dir) / "channels.txt" 31 | with open(self.channels_file, "w", encoding="utf-8") as f: 32 | f.write("@test_channel1\ntest_channel2") 33 | 34 | # Create the fetcher 35 | self.fetcher = ChannelFetcher( 36 | client=self.mock_client, 37 | store=self.store, 38 | channels_file=str(self.channels_file), 39 | limit=10, 40 | ) 41 | 42 | def tearDown(self): 43 | """Clean up after each
test.""" 44 | # Remove the temporary directory and its contents 45 | import shutil 46 | shutil.rmtree(self.temp_dir) 47 | 48 | def test_load_channels(self): 49 | """Test loading channels from the channels file.""" 50 | channels = self.fetcher._load_channels() 51 | self.assertEqual(len(channels), 2) 52 | self.assertEqual(channels, ["@test_channel1", "test_channel2"]) 53 | 54 | def test_load_channels_file_not_found(self): 55 | """Test loading channels when the file doesn't exist.""" 56 | # Remove the channels file 57 | os.remove(self.channels_file) 58 | 59 | # Try to load channels 60 | channels = self.fetcher._load_channels() 61 | self.assertEqual(channels, []) 62 | 63 | @patch("telegraphite.store.PostStore.get_existing_post_ids") 64 | @patch("telegraphite.fetcher.ChannelFetcher._process_message") 65 | async def test_fetch_channel_posts(self, mock_process_message, mock_get_existing_post_ids): 66 | """Test fetching posts from a channel.""" 67 | # Mock the existing post IDs 68 | mock_get_existing_post_ids.return_value = {1, 2} 69 | 70 | # Create mock messages 71 | mock_message1 = MagicMock(id=1) # Already exists 72 | mock_message2 = MagicMock(id=3) # New message 73 | mock_message3 = MagicMock(id=4) # New message 74 | 75 | # Set up the client to return the mock messages 76 | self.mock_client.iter_messages = AsyncMock(return_value=[mock_message1, mock_message2, mock_message3]) 77 | 78 | # Mock the process_message method to return post data 79 | mock_process_message.side_effect = [ 80 | None, # Skip message 1 (already exists) 81 | {"channel": "test_channel", "post_id": 3, "text": "Post 3"}, 82 | {"channel": "test_channel", "post_id": 4, "text": "Post 4"}, 83 | ] 84 | 85 | # Fetch posts 86 | posts = await self.fetcher._fetch_channel_posts("test_channel") 87 | 88 | # Verify results 89 | self.assertEqual(len(posts), 2) # Should have 2 new posts 90 | self.assertEqual(posts[0]["post_id"], 3) 91 | self.assertEqual(posts[1]["post_id"], 4) 92 | 93 | # Verify that existing posts 
were skipped 94 | mock_process_message.assert_called_with("test_channel", mock_message3) 95 | self.assertEqual(mock_process_message.call_count, 2) # Called for message2 and message3 96 | 97 | @patch("telegraphite.fetcher.ChannelFetcher._fetch_channel_posts") 98 | async def test_fetch_all_channels(self, mock_fetch_channel_posts): 99 | """Test fetching posts from all channels.""" 100 | # Mock the fetch_channel_posts method 101 | mock_fetch_channel_posts.side_effect = [ 102 | [{"channel": "test_channel1", "post_id": 1, "text": "Post 1"}], 103 | [{"channel": "test_channel2", "post_id": 2, "text": "Post 2"}], 104 | ] 105 | 106 | # Fetch all channels 107 | posts = await self.fetcher.fetch_all_channels() 108 | 109 | # Verify results 110 | self.assertEqual(len(posts), 2) 111 | self.assertEqual(posts[0]["channel"], "test_channel1") 112 | self.assertEqual(posts[1]["channel"], "test_channel2") 113 | 114 | # Verify that fetch_channel_posts was called for each channel 115 | self.assertEqual(mock_fetch_channel_posts.call_count, 2) 116 | mock_fetch_channel_posts.assert_any_call("@test_channel1") 117 | mock_fetch_channel_posts.assert_any_call("test_channel2") 118 | 119 | @patch("telegraphite.store.PostStore.save_posts") 120 | @patch("telegraphite.fetcher.ChannelFetcher._fetch_channel_posts") 121 | async def test_fetch_and_save(self, mock_fetch_channel_posts, mock_save_posts): 122 | """Test fetching and saving posts.""" 123 | # Mock the fetch_channel_posts method 124 | mock_fetch_channel_posts.side_effect = [ 125 | [{"channel": "test_channel1", "post_id": 1, "text": "Post 1"}], 126 | [{"channel": "test_channel2", "post_id": 2, "text": "Post 2"}], 127 | ] 128 | 129 | # Mock the save_posts method 130 | mock_save_posts.return_value = True 131 | 132 | # Fetch and save posts 133 | result = await self.fetcher.fetch_and_save() 134 | 135 | # Verify results 136 | self.assertTrue(result) 137 | mock_save_posts.assert_called_once() 138 | self.assertEqual(len(mock_save_posts.call_args[0][0]), 
2) # Called with 2 posts 139 | 140 | 141 | if __name__ == "__main__": 142 | unittest.main() -------------------------------------------------------------------------------- /tests/test_store.py: -------------------------------------------------------------------------------- 1 | """Tests for the store module. 2 | 3 | This module contains tests for the PostStore class, focusing on deduplication logic. 4 | """ 5 | 6 | import json 7 | import os 8 | import shutil 9 | import tempfile 10 | import unittest 11 | from pathlib import Path 12 | from unittest.mock import patch 13 | 14 | from telegraphite.store import PostStore 15 | 16 | 17 | class TestPostStore(unittest.TestCase): 18 | """Test cases for the PostStore class.""" 19 | 20 | def setUp(self): 21 | """Set up test environment before each test.""" 22 | # Create a temporary directory for test data 23 | self.temp_dir = tempfile.mkdtemp() 24 | self.store = PostStore(data_dir=self.temp_dir) 25 | self.test_channel = "test_channel" 26 | 27 | # Create test channel directory 28 | self.channel_dir = Path(self.temp_dir) / self.test_channel 29 | self.channel_dir.mkdir(exist_ok=True) 30 | 31 | # Sample posts for testing 32 | self.sample_posts = [ 33 | {"channel": self.test_channel, "post_id": 1, "text": "Post 1"}, 34 | {"channel": self.test_channel, "post_id": 2, "text": "Post 2"}, 35 | {"channel": self.test_channel, "post_id": 3, "text": "Post 3"}, 36 | ] 37 | 38 | def tearDown(self): 39 | """Clean up after each test.""" 40 | # Remove the temporary directory and its contents 41 | shutil.rmtree(self.temp_dir) 42 | 43 | def test_get_channel_dir(self): 44 | """Test getting the channel directory.""" 45 | # Test with @ prefix 46 | channel_dir = self.store.get_channel_dir(f"@{self.test_channel}") 47 | self.assertEqual(channel_dir, Path(self.temp_dir) / self.test_channel) 48 | 49 | # Test without @ prefix 50 | channel_dir = self.store.get_channel_dir(self.test_channel) 51 | self.assertEqual(channel_dir, Path(self.temp_dir) / 
self.test_channel) 52 | 53 | def test_get_existing_post_ids_empty(self): 54 | """Test getting existing post IDs when no posts exist.""" 55 | post_ids = self.store.get_existing_post_ids(self.test_channel) 56 | self.assertEqual(post_ids, set()) 57 | 58 | def test_get_existing_post_ids(self): 59 | """Test getting existing post IDs from saved posts.""" 60 | # Save sample posts 61 | posts_file = self.channel_dir / "posts.json" 62 | with open(posts_file, "w", encoding="utf-8") as f: 63 | json.dump(self.sample_posts, f) 64 | 65 | # Get existing post IDs 66 | post_ids = self.store.get_existing_post_ids(self.test_channel) 67 | self.assertEqual(post_ids, {1, 2, 3}) 68 | 69 | def test_save_posts_new(self): 70 | """Test saving new posts.""" 71 | # Save posts 72 | result = self.store.save_posts(self.sample_posts) 73 | self.assertTrue(result) 74 | 75 | # Check if posts were saved correctly 76 | posts_file = self.channel_dir / "posts.json" 77 | self.assertTrue(posts_file.exists()) 78 | 79 | with open(posts_file, "r", encoding="utf-8") as f: 80 | saved_posts = json.load(f) 81 | 82 | self.assertEqual(len(saved_posts), 3) 83 | self.assertEqual(saved_posts, self.sample_posts) 84 | 85 | def test_save_posts_with_duplicates(self): 86 | """Test saving posts with duplicates.""" 87 | # Save initial posts 88 | self.store.save_posts(self.sample_posts) 89 | 90 | # Create new posts with some duplicates 91 | new_posts = [ 92 | {"channel": self.test_channel, "post_id": 3, "text": "Post 3"}, # Duplicate 93 | {"channel": self.test_channel, "post_id": 4, "text": "Post 4"}, # New 94 | {"channel": self.test_channel, "post_id": 5, "text": "Post 5"}, # New 95 | ] 96 | 97 | # Save new posts 98 | result = self.store.save_posts(new_posts) 99 | self.assertTrue(result) 100 | 101 | # Check if posts were merged correctly 102 | posts_file = self.channel_dir / "posts.json" 103 | with open(posts_file, "r", encoding="utf-8") as f: 104 | saved_posts = json.load(f) 105 | 106 | # Should have 5 unique posts (1, 2, 3, 
4, 5) 107 | self.assertEqual(len(saved_posts), 5) 108 | post_ids = {post["post_id"] for post in saved_posts} 109 | self.assertEqual(post_ids, {1, 2, 3, 4, 5}) 110 | 111 | @patch("telegraphite.store.logger") 112 | def test_error_handling(self, mock_logger): 113 | """Test error handling when saving posts.""" 114 | # Create an invalid posts file (not valid JSON) 115 | posts_file = self.channel_dir / "posts.json" 116 | with open(posts_file, "w", encoding="utf-8") as f: 117 | f.write("invalid json") 118 | 119 | # Try to get existing post IDs 120 | post_ids = self.store.get_existing_post_ids(self.test_channel) 121 | self.assertEqual(post_ids, set()) 122 | 123 | # Verify that the error was logged 124 | mock_logger.error.assert_called_once() 125 | 126 | 127 | if __name__ == "__main__": 128 | unittest.main() --------------------------------------------------------------------------------