├── .github
└── workflows
│ └── release.yml
├── .gitignore
├── README.md
├── channelslists
├── OsintChannels.txt
├── knownPeople.txt
├── newsAgregators.txt
├── proRuChannels.txt
└── proUkrChannels.txt
├── getTelegram.py
├── my_telegram_scrapper
├── __init__.py
├── client.py
├── models.py
└── parser.py
├── requirements.bat
├── requirements.txt
├── run.bat
└── src
├── config.py
├── gui
├── __init__.py
├── event_handlers.py
├── main_window.py
└── ui_components.py
├── scraper
├── __init__.py
└── core_logic.py
└── utils
├── __init__.py
└── file_utils.py
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Build and Release (PyInstaller)
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | tags:
8 | - 'v*'
9 |
10 | permissions:
11 | contents: write
12 |
13 | jobs:
14 | build-and-release:
15 | runs-on: windows-latest
16 |
17 | steps:
18 | - name: Check out the code
19 | uses: actions/checkout@v3 # NOTE(review): outdated; consider actions/checkout@v4
20 |
21 | - name: Set up Python
22 | uses: actions/setup-python@v4 # NOTE(review): outdated; consider actions/setup-python@v5
23 | with:
24 | python-version: '3.9'
25 |
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install -r requirements.txt
30 | pip install pyinstaller
31 |
32 | - name: Build with PyInstaller (--onedir)
33 | run: |
34 | pyinstaller --onedir --noconfirm --clean `
35 | --add-data "channelslists;channelslists" `
36 | --add-data "my_telegram_scrapper;my_telegram_scrapper" `
37 | --add-data "src;src" `
38 | getTelegram.py
39 |
40 | - name: Prepare Release Artifact Name
41 | id: artifact_name
42 | run: |
43 | $tagName = "${{ github.ref_name }}" # on branch pushes this is the branch name (e.g. "main"), not a tag
44 | $zipFileName = "getTelegram-${tagName}.zip"
45 | echo "zip_file_name=$zipFileName" | Out-File -FilePath $env:GITHUB_OUTPUT -Encoding utf8 -Append
46 | shell: pwsh
47 |
48 | - name: Create ZIP Archive
49 | run: Compress-Archive -Path dist/getTelegram/* -DestinationPath dist/${{ steps.artifact_name.outputs.zip_file_name }}
50 | shell: pwsh
51 |
52 | - name: Create Release
53 | if: startsWith(github.ref, 'refs/tags/')
54 | id: create_release
55 | uses: actions/create-release@v1 # NOTE(review): this action is archived/unmaintained; consider softprops/action-gh-release
56 | with:
57 | tag_name: ${{ github.ref_name }}
58 | release_name: Release ${{ github.ref_name }}
59 | draft: false
60 | prerelease: false
61 | env:
62 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
63 |
64 | - name: Upload Release Asset (ZIP)
65 | if: startsWith(github.ref, 'refs/tags/')
66 | uses: actions/upload-release-asset@v1 # NOTE(review): archived/unmaintained; softprops/action-gh-release can also upload assets
67 | with:
68 | upload_url: ${{ steps.create_release.outputs.upload_url }}
69 | asset_path: dist/${{ steps.artifact_name.outputs.zip_file_name }}
70 | asset_name: ${{ steps.artifact_name.outputs.zip_file_name }}
71 | asset_content_type: application/zip
72 | env:
73 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
74 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # --- Python ---
2 | # Byte-compiled / optimized files / DLLs
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # Distribution / packaging
8 | build/
9 | dist/
10 | *.egg-info/
11 | # PyInstaller spec files (git does not support trailing comments after a pattern)
12 | *.spec
13 | # Virtual environments (common names)
14 | .env
15 | .venv
16 | env/
17 | venv/
18 | ENV/
19 | VENV/
20 | */.venv/
21 | */env/
22 | */venv/
23 |
24 | # IDE / Editor / OS files: IntelliJ/PyCharm, VS Code, Vim swap, macOS, Windows
25 | .idea/
26 | .vscode/
27 | *.swp
28 | .DS_Store
29 | Thumbs.db
30 |
31 | # Log files (if generated)
32 | *.log
33 |
34 | # --- Specific to the TelegramOSINTPolo project ---
35 |
36 | # Generated output files with posts
37 | # Ignores files starting with "output_" and ending with ".txt"
38 | output_*.txt
39 |
40 | # Archive directory for old output files
41 | # Ignores the entire 'archive' directory in the project root
42 | /archive/*
43 |
44 | # User channel list files in the channelslists directory
45 | # Ignores all .txt files inside channelslists.
46 | # Consider leaving an example file in the repository (e.g., example.txt)
47 | # and unignoring it using: !channelslists/example.txt
48 | channelslists/*.txt
49 |
50 | # --- Optionally ---
51 | # Files with secrets or sensitive configuration (if you add any)
52 | # e.g. secrets.ini, config.yaml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TODO:
2 | * LIST OF CHANNELS ARE RANDOM FOR NOW - TODO REVIEW their correct political BIAS
3 | * The workflow for deploying .exe files requires a fix. I don't have time for that right now. Maybe later. Sorry.
4 |
5 | # User Manual for the "Download Telegram Posts" Application
6 |
7 | This application is used to download posts from Telegram channels and save them to text files.
8 |
9 | ## 1. Preparing the Channel List:
10 |
11 | * **Recommendation:** Create *two* separate channel lists in `.txt` files. You can divide them thematically (e.g., "news", "sport", "technology") or by *political bias* (e.g., "pro-Russian", "pro-Ukrainian", "neutral"). Such division will facilitate later analysis.
12 | * Each channel should be on a separate line, either as a full URL (e.g., `https://t.me/channel_name`) or just the name (e.g., `channel_name`). ***Do not* add a '/' character at the end of the channel name**.
13 | * If you have channels in the full URL format, the program will automatically remove everything before the last '/' and retrieve only the channel name.
14 | * The program includes a sample list of over 160 channels, but these are just examples. You need to create your actual list yourself.
15 |
16 | 
17 |
18 |
19 | ## 2. Launching the Application:
20 |
21 | * After launching the application, you will see the main program window.
22 |
23 |
24 |
25 |
26 |
27 | ## 3. Configuration:
28 |
29 | * **Select Channel File:**
30 | * Click the "Browse..." button.
31 | * In the dialog window, select the `.txt` file containing the channel list.
32 | * The path to the selected file will appear in the text field.
33 |
34 | * **Select Date:**
35 | * Choose the desired date or date range using the provided options (Specific Date, Date Range, Today, Yesterday, All).
36 |
37 | ## 4. Downloading Posts:
38 |
39 | * Click the relevant "Download..." button based on your date selection.
40 | * The application will start downloading posts from the channels listed in the selected file.
41 | * **Logs (what's happening):** In the lower part of the window, in the "Logs" field, informational messages about the progress will be displayed:
42 | * The name of the currently downloaded channel.
43 | * The number of posts downloaded from that channel.
44 | * The content of the downloaded posts (including the link).
45 | * Any error messages.
46 | * The download process may take a while, depending on the number of channels and posts. Do not close the program window until the process is complete. You can use the "STOP SCRAPING" button if needed.
47 |
48 | ## 5. Saving Results:
49 |
50 | * After the download is complete (or stopped), the application automatically saves the posts to `.txt` file(s).
51 | * The output filename format is: `output_FILENAME-WITH-CHANNELS_YYYY-MM-DD.txt`, where:
52 | * `FILENAME-WITH-CHANNELS` is the name of the file you selected with the channel list (without the `.txt` extension).
53 | * `YYYY-MM-DD` is the date from which the posts were downloaded (year-month-day). For range or 'all' modes, multiple files might be created/updated.
54 | * The output file(s) will be created in the same directory where the application's executable (.exe) or Python script is located. Existing `output_*.txt` files will be moved to an `archive` subfolder before a new scrape starts.
55 | * A success, interruption, or error message will appear in a pop-up window.
56 |
57 | ## 6. Data Analysis with NotebookLM:
58 |
59 | * **Key Step:** After downloading the data (the `output_*.txt` files), you can use Google's *NotebookLM* (https://notebooklm.google.com) to efficiently analyze the collected information. NotebookLM works like RAG (Retrieval-Augmented Generation), which means you can "talk" to your data.
60 |
61 | * There are two versions of the notebook. The free version is perfectly sufficient, but no one is stopping you from buying the Plus version.
62 | > In the NotebookLM version, you can have up to 100 notebooks, and each can contain up to 50 sources. Each source can contain up to half a million words. All users initially get 50 chat queries and can generate 3 audio summaries.
63 | >
64 | > If you upgrade to NotebookLM Plus, these limits increase at least 5-fold – to 500 notebooks and 300 sources per notebook. Daily query limits also increase – you will be able to ask up to 500 chat queries and generate 20 audio summaries each day. When sharing a notebook, the source limit does not change: both you and the people you share the notebook with can upload a maximum of 300 sources to it.
65 |
66 | 
67 |
68 | * **How to use NotebookLM:**
69 | 1. Upload the downloaded `.txt` files to NotebookLM as sources.
70 | 2. NotebookLM will process these files and allow you to ask questions in natural language about their content.
71 | 3. You can ask for summaries, sentiment analysis, search for specific information, compare content from different channels, identify trends, and even generate new texts based on the downloaded data.
72 | 4. Use the notebook to ask questions about the uploaded files.
73 |
74 | 
75 |
76 |
77 | * **Advantages of Analysis in NotebookLM (RAG):**
78 | * **Context:** NotebookLM analyzes your questions in the *context* of the uploaded data. Answers are based *directly* on information from the files, minimizing the risk of hallucinations (the language model inventing information).
79 | * **Precision:** You can refer to specific text fragments, making it easier to verify information and track sources. NotebookLM can indicate where a particular answer comes from.
80 | * **Efficiency:** You don't have to manually search through hundreds of posts. NotebookLM does it for you, saving your time and effort.
81 | * **Deeper Analysis:** Thanks to the ability to ask questions and generate summaries, you can gain much deeper insights into the data than with traditional analysis. You can discover hidden patterns, connections, and trends that might otherwise be missed.
82 | * **Interactivity:** NotebookLM allows dynamic interaction with data. You can modify your queries on the fly and get immediate answers.
83 | * **Security:** NotebookLM, using the uploaded files as its source of information, does not draw information from uncertain sources.
84 |
85 | ## Additional Notes:
86 |
87 | * Ensure you have a stable internet connection while downloading posts.
88 | * For a very large number of channels or posts, downloading may take longer. The 'Download All' option can be particularly time-consuming.
89 | * If an error occurs, check the message content in the "Logs" field and ensure the provided channel name is correct and the channel is publicly accessible via web view.
90 | * The program originally used the `accless-tg-scraper` library (https://github.com/Kisspeace/accless-tg-scraper), which worked without the official Telegram API by scraping the public web preview of channels. After some consideration, and realizing that updates to that library were unlikely, I wrote my own scraper from scratch — but I leave the link to that repo to credit where the idea came from.
91 |
92 | ## REMEMBER THAT THE RESPONSIBILITY FOR VERIFYING SOURCES LIES SOLELY WITH YOU. THE NUMBERS NEXT TO THE TEXT (1) IN NOTEBOOKLM ARE LINKS TO QUOTATIONS USED BY THE LLM. THE OUTPUT FILES FROM THIS APPLICATION CONTAIN THE POST CONTENT AND A DIRECT LINK TO THE ORIGINAL POST (2) ON TELEGRAM FOR VERIFICATION.
93 | 
94 |
--------------------------------------------------------------------------------
/channelslists/OsintChannels.txt:
--------------------------------------------------------------------------------
1 | https://t.me/osintbees
2 | https://t.me/DeepStateUA
3 | https://t.me/rybar
4 | https://t.me/osint_69
5 | https://t.me/arrowsmap
6 | https://t.me/CITeam
7 | https://t.me/CIT_shellings
8 | https://t.me/radar_russia_monitor
9 | https://t.me/control_sigma
--------------------------------------------------------------------------------
/channelslists/knownPeople.txt:
--------------------------------------------------------------------------------
1 | https://t.me/AleksandrSemchenko
2 | https://t.me/NeoficialniyBeZsonoV
3 | https://t.me/vrogov
4 | https://t.me/yzheleznyak
5 | https://t.me/olegtsarov
6 | https://t.me/zhivoff22
7 | https://t.me/SergeyKolyasnikov
8 | https://t.me/ASupersharij
9 | https://t.me/epoddubny
10 | https://t.me/khramov_alexander
11 | https://t.me/A_S_Sukonkin
12 | https://t.me/i_strelkov_2023
13 | https://t.me/IgorLinkChannel
14 | https://t.me/VGlagola
15 | https://t.me/RKadyrov_95
16 | https://t.me/bochkala_war
17 | https://t.me/filatovcorr
18 | https://t.me/margaritasimonyan
19 | https://t.me/garmaev_alexander
20 | https://t.me/Tsaplienko
21 | https://t.me/anatoly_nesmiyan
22 | https://t.me/nevzorovtv
23 | https://t.me/Sladkov_plus
24 | https://t.me/sashakots
25 | https://t.me/iistrelkov
26 | https://t.me/montian_official2
27 | https://t.me/wargonzo
28 | https://t.me/montyan2
29 | https://t.me/shevchenkomax_1
30 | https://t.me/strelkovii
31 | https://t.me/pgubarev
32 | https://t.me/BalitskyEV
33 | https://t.me/David_Arakhamia
34 | https://t.me/RSaponkov
35 | https://t.me/stanislav_osman
36 | https://t.me/rubaevCIS
37 | https://t.me/ryabseva_zhanna
38 | https://t.me/agurulev
39 | https://t.me/yurasumy
40 | https://t.me/akashevarova
41 | https://t.me/butrimov
42 | https://t.me/tmelnychuk
43 | https://t.me/ButusovPlus
44 | https://t.me/mardanaka
45 | https://t.me/alexandrshtefanov
46 | https://t.me/aleksandr_skif
47 | https://t.me/romanov_92
48 | https://t.me/JokerDPR
49 | https://t.me/vladlentatarsky
50 | https://t.me/a_shtirlitz
51 | https://t.me/JusuMakonis
52 | https://t.me/shouvalov
53 | https://t.me/daniel_orlov
54 | https://t.me/osirskiy
--------------------------------------------------------------------------------
/channelslists/newsAgregators.txt:
--------------------------------------------------------------------------------
1 | https://t.me/ToBeOrChat
2 | https://t.me/ejdailyru
3 | https://t.me/radiotrek
4 | https://t.me/supernova_plus
5 | https://t.me/objectivetv
6 | https://t.me/InsiderUA_UK
7 | https://t.me/bbc_khm
8 | https://t.me/uniannet
9 | https://t.me/nexta_live
10 | https://t.me/Starkon_city
11 | https://t.me/khmlv
12 | https://t.me/bbbreaking
13 | https://t.me/sotavisionmedia
14 | https://t.me/roy_tv_mk
15 | https://t.me/astrapress
16 | https://t.me/UaOnlii
17 | https://t.me/anna_news
18 | https://t.me/zvizdecmanhustu
19 | https://t.me/BILD_Russian
20 | https://t.me/agentstvonews
21 | https://t.me/bihusinfo
22 | https://t.me/vchkogpu
23 | https://t.me/russianocontext
24 | https://t.me/idelrealii
25 | https://t.me/mobilizationnews
26 | https://t.me/russian_trash_news
27 | https://t.me/horizontal_russia
28 | https://t.me/news_sirena
29 | https://t.me/dirtytatarstan
30 | https://t.me/nsnfm
31 | https://t.me/sledcom_press
32 | https://t.me/ne_zhdi_novosti
33 | https://t.me/brieflyru
34 | https://t.me/tvrain
35 | https://t.me/dagpravdaru
36 | https://t.me/YourNewsTalk
37 | https://t.me/ToBeOr_Official
38 | https://t.me/svobodnieslova
39 | https://t.me/currenttime
40 | https://t.me/milinfolive
41 | https://t.me/meduzalive
42 | https://t.me/opersvodki
43 | https://t.me/bazabazon
44 | https://t.me/playcivilization
45 | https://t.me/rusvesnasu
46 | https://t.me/readovkaru
47 | https://t.me/tradkz
48 | https://t.me/SputnikAtoNews
49 | https://t.me/shot_shot
50 | https://t.me/rbc_news
51 | https://t.me/moscow_laundry
52 | https://t.me/periskop_pacific
53 | https://t.me/regnum_na
--------------------------------------------------------------------------------
/channelslists/proRuChannels.txt:
--------------------------------------------------------------------------------
1 | https://t.me/osetin20
2 | https://t.me/WarZoneInc
3 | https://t.me/Love_Russia_Beauty
4 | https://t.me/skurlatovlive
5 | https://t.me/donrf22
6 | https://t.me/brussinf
7 | https://t.me/economica_russia
8 | https://t.me/russ_orientalist
9 | https://t.me/kremlin_secrets
10 | https://t.me/mrfrostoviklive
11 | https://t.me/mortisaeterna
12 | https://t.me/sidpolit
13 | https://t.me/Kolomna_Gorod
14 | https://t.me/Ugolok_Sitha
15 | https://t.me/kolomna750
16 | https://t.me/dolg_z
17 | https://t.me/combat_hemp
18 | https://t.me/btvt2019
19 | https://t.me/ramzayiegokomanda
20 | https://t.me/philologist_zov
21 | https://t.me/rustroyka1945
22 | https://t.me/m0nstas
23 | https://t.me/osvedomitell_alex
24 | https://t.me/divannaya_brigada
25 | https://t.me/soldat_prav
26 | https://t.me/ZONA_P
27 | https://t.me/russkiy_opolchenec
28 | https://t.me/BKPROGRESSor
29 | https://t.me/SIL0VIKI
30 | https://t.me/motopatriot78
31 | https://t.me/donetchan
32 | https://t.me/Nackepelo
33 | https://t.me/TheDeadDistrict
34 | https://t.me/notes_veterans
35 | https://t.me/babaycalls
36 | https://t.me/tankistrossii100
37 | https://t.me/news_mvddnr
38 | https://t.me/z4lpr
39 | https://t.me/khornegroup
40 | https://t.me/Soldieroffortune777
41 | https://t.me/dosye_shpiona
42 | https://t.me/russkiegramoty
43 | https://t.me/norinea
44 | https://t.me/communitynumber5
45 | https://t.me/OmTVchannel
46 | https://t.me/vozhak_Z
47 | https://t.me/obshina_ru
48 | https://t.me/lost_generation_88
49 | https://t.me/itsdonetsk
--------------------------------------------------------------------------------
/channelslists/proUkrChannels.txt:
--------------------------------------------------------------------------------
1 | https://t.me/slvn_pomet
2 |
3 | https://t.me/dontstopwar
4 |
5 | https://t.me/Operatyvnyi_Donbas
6 |
7 | https://t.me/ukrainian_militant
8 |
9 | https://t.me/batalionmonako
10 |
11 | https://t.me/karymat
12 |
13 | https://t.me/Ateobreaking
14 |
15 | https://t.me/BaluHUB777
16 |
17 | https://t.me/adept_ua
18 |
19 | https://t.me/Za_Derjavy
20 |
21 | https://t.me/odeskaODA
22 |
23 | https://t.me/OSHP_225
24 |
25 | https://t.me/sprava_groma
26 |
27 | https://t.me/ua_stalker
28 |
29 | https://t.me/hochu_zhyt
30 |
31 | https://t.me/warinmyeyes_chat
32 |
33 | https://t.me/braty_yakovlevu
34 |
35 | https://t.me/TyskNIP
36 |
37 | https://t.me/itarmyofukraine2022
38 |
39 | https://t.me/atodoneck
40 |
41 | https://t.me/dtek_ua
42 |
43 | https://t.me/atomiccherry
44 |
45 | https://t.me/Ukr_G_M
46 |
47 | https://t.me/lost_warinua
48 |
49 | https://t.me/khersonskaODA
50 |
51 | https://t.me/ua_hero
--------------------------------------------------------------------------------
/getTelegram.py:
--------------------------------------------------------------------------------
1 | # Main entry point for the Telegram Scraper application.
2 | # import tkinter as tk # OLD
3 | import customtkinter as ctk # NEW
4 | from tkinter import messagebox # Keep messagebox from standard tkinter
5 | import os # NOTE(review): appears unused in this file — confirm before removing
6 | import sys
7 | from pathlib import Path # Use pathlib for easier path handling
8 | import tkinter as tk # Needed for root_err in the except blocks below
9 |
10 |
11 | # --- Determine Base Directory ---
12 | # (Keep this section as is)
13 | if getattr(sys, 'frozen', False):
14 | base_dir = Path(sys.executable).parent
15 | elif __file__:
16 | base_dir = Path(__file__).parent
17 | else:
18 | base_dir = Path.cwd()
19 |
20 | # --- Dynamically add project root and src to sys.path ---
21 | project_root = base_dir
22 | src_dir = project_root / 'src'
23 | if str(project_root) not in sys.path:
24 | sys.path.insert(0, str(project_root))
25 | if str(src_dir) not in sys.path:
26 | sys.path.insert(0, str(src_dir))
27 |
28 | # --- Set CustomTkinter Appearance ---
29 | ctk.set_appearance_mode("System") # Options: "System", "Light", "Dark"
30 | ctk.set_default_color_theme("blue") # Options: "blue", "green", "dark-blue"
31 |
32 | # --- Import GUI Component and Dependencies ---
33 | try:
34 | from gui.main_window import TelegramScraperGUI
35 | # (Keep other imports and the basic structure of the try/except block)
36 | from scraper import run_scraping
37 | from my_telegram_scrapper import SimpleScraperClient
38 | except ImportError as e:
39 | project_root = base_dir # <-- Added this line to fix a NameError
40 | error_details = f"{e}\n\n"
41 | error_details += f"Could not import required components.\n"
42 | error_details += f"Please ensure 'src' and 'my_telegram_scrapper' directories exist relative to the executable or script:\n{project_root}\n"
43 | error_details += "Also, verify that all dependencies (including customtkinter) from requirements.txt are installed."
44 | print(f"Fatal Error: {error_details}")
45 | # Attempt to show a GUI error message (using standard tkinter temporarily if ctk fails)
46 | try:
47 | # Use a temporary standard Tk root for the error if ctk fails early
48 | root_err = tk.Tk()
49 | root_err.withdraw()
50 | messagebox.showerror("Startup Error", f"Failed to load application components.\n\n{error_details}")
51 | root_err.destroy()
52 | except Exception: # Catch broader exceptions here, including tk.TclError
53 | print("GUI error: Could not display the error message box.")
54 | sys.exit(1)
55 | except Exception as e:
56 | # Catch any other unexpected error during initial imports
57 | print(f"Fatal Error during startup: {e}")
58 | try:
59 | # Use a temporary standard Tk root for the error if ctk fails early
60 | root_err = tk.Tk()
61 | root_err.withdraw()
62 | messagebox.showerror("Startup Error", f"An unexpected error occurred during initialization:\n\n{e}")
63 | root_err.destroy()
64 | except Exception:
65 | pass # Console print is the fallback
66 | sys.exit(1)
67 |
68 | # --- Main Execution Function ---
69 | def main():
70 | """Sets up and runs the CustomTkinter application."""
71 | # root = tk.Tk() # OLD
72 | root = ctk.CTk() # NEW
73 | try:
74 | # Pass the base_dir (as a string or Path object) to the GUI
75 | app = TelegramScraperGUI(root, str(base_dir)) # Pass as string if GUI expects it
76 | root.minsize(600, 700) # Adjusted minsize slightly
77 | root.mainloop()
78 | except Exception as e:
79 | print(f"Fatal Error running the application: {e}")
80 | # Attempt to show error message if GUI fails during runtime
81 | try:
82 | # customtkinter windows might not have winfo_exists in the same way
83 | # Just try showing the error
84 | messagebox.showerror("Application Error", f"An unexpected error occurred while running:\n\n{e}")
85 | if root: # Check if root object exists
86 | root.destroy()
87 | except Exception: # Catch broader exceptions
88 | pass # Avoid errors if the window is already gone
89 | sys.exit(1)
90 |
91 | # --- Script Entry Point ---
92 | if __name__ == "__main__":
93 | main()
94 |
--------------------------------------------------------------------------------
/my_telegram_scrapper/__init__.py:
--------------------------------------------------------------------------------
1 | # my_scraper/__init__.py
2 | from .client import SimpleScraperClient
3 | from .models import SimpleTgPost, ScrapedPage
4 | # You can add parser functions or model classes here if you want direct access
5 | # e.g., from my_scraper import SimpleScraperClient
--------------------------------------------------------------------------------
/my_telegram_scrapper/client.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from requests.exceptions import RequestException, ConnectionError, Timeout
3 | from typing import Optional, Dict
4 |
5 | from .parser import parse_page
6 | from .models import ScrapedPage
7 |
8 | class SimpleScraperClient:
9 | """
10 | A simple client to fetch and parse Telegram channel web preview pages.
11 | """
12 | BASE_URL: str = "https://t.me"
13 | DEFAULT_USER_AGENT: str = (
14 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
15 | '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
16 | )
17 | REQUEST_TIMEOUT: int = 15 # request timeout in seconds (slightly longer than default)
18 |
19 | def __init__(self, headers: Optional[Dict[str, str]] = None):
20 | """
21 | Initializes the requests session with default or provided headers.
22 | """
23 | self.session = requests.Session()
24 | # Set default headers to mimic a browser
25 | default_headers = {
26 | 'User-Agent': self.DEFAULT_USER_AGENT,
27 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
28 | 'Accept-Language': 'en-US,en;q=0.9',
29 | }
30 | # NOTE: if 'headers' is provided it is used INSTEAD of default_headers (no merge takes place)
31 | self.session.headers.update(headers or default_headers)
32 |
33 | def get_channel_page(self, channel_username: str, before_token: Optional[str] = None) -> Optional[ScrapedPage]:
34 | """
35 | Fetches and parses a single page of posts from a channel's web view.
36 |
37 | Args:
38 | channel_username: The username of the target channel (without '@').
39 | before_token: The token/ID to fetch posts before this point (for pagination).
40 |
41 | Returns:
42 | A ScrapedPage object containing posts and next page token, or None on error.
43 | """
44 | url = f"{self.BASE_URL}/s/{channel_username}"
45 | params = {}
46 | if before_token:
47 | params['before'] = before_token
48 |
49 | try:
50 | response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
51 | response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
52 |
53 | # Check explicit status code, though raise_for_status covers most cases
54 | if response.status_code == 200:
55 | # Parse the HTML content
56 | return parse_page(response.text)
57 | else:
58 | # This case is less likely if raise_for_status() is used, but kept for safety
59 | print(f"Error: Received unexpected status code {response.status_code} for {url}")
60 | return None
61 |
62 | except Timeout:
63 | print(f"Error: Request timed out for {url}")
64 | return None
65 | except ConnectionError:
66 | print(f"Error: Could not connect to {url}. Check network connection.")
67 | return None
68 | except RequestException as e:
69 | # Catches other requests-related errors (like HTTPError from raise_for_status)
70 | print(f"Error fetching {url}: {e}")
71 | return None
72 | except Exception as e:
73 | # Catch potential errors during parsing (though should be handled in parser ideally)
74 | print(f"An unexpected error occurred processing channel '{channel_username}': {e}")
75 | return None
76 |
77 | def close(self):
78 | """Closes the underlying requests session."""
79 | if self.session:
80 | self.session.close()
81 | print("Requests session closed.") # Optional: confirmation
82 |
83 | # Context manager support
84 | def __enter__(self):
85 | return self
86 |
87 | def __exit__(self, exc_type, exc_val, exc_tb):
88 | """Ensures the session is closed when exiting a 'with' block."""
89 | self.close()
--------------------------------------------------------------------------------
/my_telegram_scrapper/models.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from datetime import datetime
3 | from typing import Optional, List
4 |
5 | @dataclass
6 | class SimpleTgAuthor:
7 | """Represents basic information about a post author."""
8 | username: Optional[str] = None # handle parsed from profile_url (no '@')
9 | display_name: Optional[str] = None # human-readable name shown on the post
10 | profile_url: Optional[str] = None # full https://t.me/... link, when available
11 |
12 | @dataclass
13 | class SimpleTgPost:
14 | """Represents basic information about a scraped Telegram post."""
15 | post_id: Optional[int] = None # numeric ID parsed from post_url
16 | post_url: Optional[str] = None # direct https://t.me/<channel>/<id> link
17 | content: Optional[str] = None # post text (newlines preserved); None when no text found
18 | timestamp: Optional[datetime] = None # parsed from the page's ISO datetime; None if unparsable
19 | views: Optional[str] = None # e.g., '1.8K', kept as string for simplicity
20 | # Use field to provide a default_factory for mutable types like classes
21 | author: SimpleTgAuthor = field(default_factory=SimpleTgAuthor)
22 | # Add other fields as needed (e.g., media URLs)
23 | # image_urls: List[str] = field(default_factory=list)
24 | # video_urls: List[str] = field(default_factory=list)
25 |
26 | @dataclass
27 | class ScrapedPage:
28 | """Represents the results from scraping one page of a channel."""
29 | posts: List[SimpleTgPost] = field(default_factory=list) # parsed posts from this page
30 | next_page_token: Optional[str] = None # e.g., the 'before' ID for the next request
31 | # channel_name: Optional[str] = None # Could add channel info here if needed
--------------------------------------------------------------------------------
/my_telegram_scrapper/parser.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup, Tag
2 | from datetime import datetime
3 | import re
4 | from typing import List, Optional
5 |
6 | # Import the dataclass models
7 | from .models import SimpleTgPost, SimpleTgAuthor, ScrapedPage
8 |
9 | TELEGRAM_BASE_URL: str = "https://t.me"
10 |
11 | def _parse_post_id_from_url(url: Optional[str]) -> Optional[int]:
12 | """Extracts the post ID (integer) from a Telegram post URL; None if absent or no URL."""
13 | if not url:
14 | return None
15 | # Regex: find digits preceded by '/' and followed by '?' or end of string
16 | match = re.search(r'/(\d+)(?:\?|$)', url)
17 | return int(match.group(1)) if match else None
18 |
19 | def _parse_username_from_url(url: Optional[str]) -> Optional[str]:
20 | """Extracts the username from a Telegram profile or channel URL."""
21 | if not url:
22 | return None
23 | # Regex: find text after the last '/' but before '?' or end of string
24 | # Handles /s/channel, /channel, etc. NOTE: returns None if the URL has a trailing '/'
25 | match = re.search(r'/([^/?]+)$', url)
26 | # Alternative if above is too greedy: r'/s?/([^/?]+)'
27 | return match.group(1) if match else None
28 |
29 | def _safe_find_text(element: Optional[Tag], selector: str, strip: bool = True) -> Optional[str]:
30 | """Safely finds an element using a CSS selector and returns its stripped text; None if missing."""
31 | if not element:
32 | return None
33 | found = element.select_one(selector) # Use CSS selector
34 | return found.get_text(strip=strip) if found else None
35 |
36 | def _safe_get_attr(element: Optional[Tag], selector: str, attribute: str) -> Optional[str]:
37 | """Safely finds an element using a CSS selector and returns a specific attribute; None if missing."""
38 | if not element:
39 | return None
40 | found = element.select_one(selector)
41 | return found.get(attribute) if found else None
42 |
def parse_single_post(post_element: Tag) -> Optional[SimpleTgPost]:
    """Parses a single post HTML element (div.tgme_widget_message_wrap) into a SimpleTgPost object.

    Args:
        post_element: BeautifulSoup Tag wrapping one message preview.

    Returns:
        A populated SimpleTgPost, or None when the input is not a Tag or
        the main '.tgme_widget_message' container is absent.
    """
    if not isinstance(post_element, Tag):
        return None

    post = SimpleTgPost()  # Initialize with defaults from dataclass

    # Main message container is crucial
    widget_message = post_element.select_one(".tgme_widget_message")
    if not widget_message:
        print("Warning: Could not find main message container ('.tgme_widget_message') in post element.")
        return None  # Cannot proceed without this

    # --- Basic Post Info ---
    # Prefer the explicit data-post-url attribute; fall back to building the
    # URL from the data-post slug (e.g. "channel/12345").
    data_post_url = widget_message.get('data-post-url')
    data_post = widget_message.get('data-post')  # e.g., channel/12345
    if data_post_url:
        post.post_url = data_post_url
    elif data_post:
        post.post_url = f"{TELEGRAM_BASE_URL}/{data_post}"
    post.post_id = _parse_post_id_from_url(post.post_url)

    # --- Author Info ---
    # Look for the primary author name structure first
    author_link_tag = widget_message.select_one(".tgme_widget_message_owner_name a")
    if author_link_tag:
        post.author.profile_url = author_link_tag.get('href')
        post.author.username = _parse_username_from_url(post.author.profile_url)
        # Get text directly from the link's span if available
        author_name_span = author_link_tag.select_one("span")  # or "span.name" if specific
        post.author.display_name = author_name_span.get_text(strip=True) if author_name_span else author_link_tag.get_text(strip=True)
    else:
        # Fallback for potentially different structures (e.g., forwarded messages might differ)
        author_user_tag = widget_message.select_one(".tgme_widget_message_from_author")  # Check for forwarded author
        if author_user_tag:
            post.author.display_name = author_user_tag.get_text(strip=True)
            # Profile URL/username might not be available for forwarded authors in preview

    # --- Content ---
    # Select the text element, handling potential variations
    text_element = widget_message.select_one(".tgme_widget_message_text")
    if text_element:
        # Use separator='\n' to preserve line breaks within the post text
        post.content = text_element.get_text(separator='\n', strip=True)
    else:
        # Sometimes content is directly in the message bubble without a specific text class
        # This is less reliable and might grab unwanted text like "Forwarded message"
        # fallback_text = widget_message.select_one(".tgme_widget_message_bubble > .tgme_widget_message_text") # Example
        post.content = None  # Or try a broader fallback if needed

    # --- Timestamp ---
    time_tag = widget_message.select_one(".tgme_widget_message_date time")
    if time_tag and time_tag.get('datetime'):
        raw_ts = time_tag['datetime']
        # FIX: datetime.fromisoformat() on Python < 3.11 (CI pins 3.9) rejects
        # a trailing 'Z' UTC designator, which previously made such timestamps
        # fall into the except branch and be silently dropped. Normalize first.
        if raw_ts.endswith('Z'):
            raw_ts = raw_ts[:-1] + '+00:00'
        try:
            # Attempt to parse ISO format timestamp (e.g., 2023-10-27T10:30:00+00:00)
            post.timestamp = datetime.fromisoformat(raw_ts)
        except ValueError:
            print(f"Warning: Could not parse timestamp datetime: {time_tag.get('datetime')}")
            post.timestamp = None  # Handle parsing errors gracefully

    # --- Views ---
    # Views might be inside the date container or separate
    post.views = _safe_find_text(widget_message, ".tgme_widget_message_views")

    # --- Placeholder: Add parsing for media (images, videos) if needed ---
    # Example (very basic background image style):
    # photo_wrap = widget_message.select_one(".tgme_widget_message_photo_wrap[style*='background-image']")
    # if photo_wrap:
    #     style = photo_wrap.get('style', '')
    #     match = re.search(r"background-image:url\('(.*?)'\)", style)
    #     if match:
    #         # post.image_urls.append(match.group(1)) # Assuming image_urls list exists
    #         pass

    return post
118 |
def parse_page(html_content: str) -> ScrapedPage:
    """Parses the HTML content of a Telegram channel's web preview page.

    Returns a ScrapedPage holding every successfully parsed post plus the
    'before' pagination token (used to request older posts), if present.
    """
    soup = BeautifulSoup(html_content, 'lxml')  # lxml parser, as elsewhere
    result = ScrapedPage()

    # Every message preview is wrapped in a div carrying this class.
    wrappers = soup.select(".tgme_widget_message_wrap")
    if not wrappers:
        print("Warning: No post elements found with selector '.tgme_widget_message_wrap'. Page structure might have changed.")

    for wrapper in wrappers:
        parsed = parse_single_post(wrapper)
        if parsed:
            result.posts.append(parsed)

    # The 'Load more' anchor points at older posts via a '?before=' parameter;
    # its value becomes the token for fetching the next (older) page.
    more_link = soup.select_one('a.tme_messages_more[href*="?before="]')
    if more_link:
        token_match = re.search(r'[?&]before=(\d+)', more_link.get('href', ''))
        if token_match:
            result.next_page_token = token_match.group(1)
        else:
            print("Warning: Found 'Load More' link but could not extract 'before' token.")

    # Channel-level info (title, description, ...) could be parsed here later:
    # channel_info_header = soup.select_one(".tgme_channel_info_header_title")
    # if channel_info_header:
    #     result.channel_name = channel_info_header.get_text(strip=True)

    return result
--------------------------------------------------------------------------------
/requirements.bat:
--------------------------------------------------------------------------------
@echo off
REM Installs the project's Python dependencies listed in requirements.txt.
echo Installing required Python packages from requirements.txt...
pip install -r requirements.txt
echo.
echo Installation attempt finished. Check above for any errors.
pause
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
3 | lxml
4 | customtkinter
--------------------------------------------------------------------------------
/run.bat:
--------------------------------------------------------------------------------
@echo off
REM Launches the Telegram scraper GUI using the system Python interpreter.
echo Running getTelegram.py...
python getTelegram.py
echo.
echo Program finished or closed.
pause
--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
# TelegramOSINTPolo-main/src/config.py
from datetime import date

# --- CONSTANTS ---
# Name of the archive directory (referenced by other modules; not used here —
# presumably where scraped output is stored, verify against src/utils).
ARCHIVE_DIR_NAME: str = "archive"

# Stop scraping posts older than this date (inclusive)
# Ensures we don't go back indefinitely.
CUTOFF_DATE: date = date(2022, 1, 1)

# Potentially add other configurations here if needed
# DEFAULT_OUTPUT_FILENAME_FORMAT: str = "output_{list_name}_{date}.txt"
# LOG_LEVEL: str = "INFO"
--------------------------------------------------------------------------------
/src/gui/__init__.py:
--------------------------------------------------------------------------------
1 | # src/gui/__init__.py
2 | # This file makes 'gui' a Python package.
3 | from .main_window import TelegramScraperGUI
--------------------------------------------------------------------------------
/src/gui/event_handlers.py:
--------------------------------------------------------------------------------
1 | # import tkinter as tk # OLD
2 | import customtkinter as ctk # NEW
3 | import tkinter as tk # Keep standard tk for messagebox, TclError, maybe Spinbox type check
4 | from tkinter import filedialog, messagebox, ttk # Keep ttk for Spinbox type check
5 | import threading
6 | import queue
7 | from datetime import date, timedelta, datetime
8 | import calendar
9 | import os
10 | from pathlib import Path
11 | from typing import Optional, Callable, Tuple, Any # Added Any
12 |
13 | from src.config import CUTOFF_DATE
14 | try:
15 | from src.scraper import run_scraping
16 | except ImportError:
17 | messagebox.showerror("Import Error", "Could not load the core scraping module ('src.scraper'). Please check installation.")
18 | import sys
19 | sys.exit(1)
20 |
class GuiEventHandlers:
    """Contains event handling methods for the TelegramScraperGUI.

    Holds no widget state of its own: every handler reaches back into the
    owning app instance (``self.app``) for widgets, tk variables, the log
    queue, and the scraping thread / stop event.
    """

    def __init__(self, app_instance):
        # Reference back to the TelegramScraperGUI that owns the widgets.
        self.app = app_instance

    # --- Log Handling ---
    def log_message(self, message: str, level: str = "INFO"):
        """Formats a log line with timestamp/level and queues it for the GUI.

        Unknown levels are coerced to INFO. Falls back to printing when the
        app has no ``log_queue`` attribute (e.g. during early startup).
        """
        level = level.upper()
        if level not in ["DEBUG", "INFO", "WARN", "ERROR"]:
            level = "INFO"
        timestamp = datetime.now().strftime("%H:%M:%S")
        formatted_message = f"[{timestamp}][{level}] {message}"
        try:
            self.app.log_queue.put(formatted_message)
        except AttributeError:
            print(f"Fallback Log: {formatted_message}")

    def process_log_queue(self):
        """Processes messages from the log queue and updates the GUI log text (CTkTextbox)."""
        try:
            while not self.app.log_queue.empty():
                full_message = self.app.log_queue.get_nowait()
                tag = ""
                # Determine tag based on level prefix (same logic)
                if "[ERROR]" in full_message: tag = "ERROR"
                elif "[WARN]" in full_message: tag = "WARN"
                elif "[INFO]" in full_message: tag = "INFO"
                elif "[DEBUG]" in full_message: tag = "DEBUG"

                # Check if master window and log_text widget exist
                # Use ctk checks if available, otherwise standard tkinter checks
                if self.app.master and hasattr(self.app, 'log_text') and self.app.log_text:
                    # CTkTextbox needs state change to insert
                    self.app.log_text.configure(state="normal")
                    if tag:
                        self.app.log_text.insert(ctk.END, full_message + '\n', (tag,))
                    else:
                        self.app.log_text.insert(ctk.END, full_message + '\n')
                    self.app.log_text.see(ctk.END) # Scroll to the end
                    self.app.log_text.configure(state="disabled") # Disable editing again
        except queue.Empty:
            pass
        except Exception as e:
            print(f"Error processing log queue: {e}") # Fallback print
        finally:
            # Reschedule check only if the master window still exists (basic check)
            if self.app.master:
                self.app.master.after(100, self.process_log_queue)


    # --- File Dialog (No longer used for channel list) ---
    def open_file_dialog(self):
        """Opens a dialog to select the channel list file. (DEPRECATED)"""
        self.log_message("Browse button clicked (feature deprecated, use dropdown).", "WARN")
        messagebox.showinfo("Info", "Channel list selection is now done via the dropdown menu.")
        # Keep original logic commented out or remove if preferred
        # initial_dir_path = Path(self.app.base_dir) / "channelslists"
        # if not initial_dir_path.is_dir():
        # initial_dir_path = Path(self.app.base_dir)
        # filename = filedialog.askopenfilename(...)
        # if filename: self.app.channellist_path.set(filename) ...


    # --- Date Validation ---
    def validate_date_spinbox(self, prefix: str):
        """Adjusts the maximum day for the selected month and year spinboxes."""
        # (This logic interacts with tk.IntVar and ttk.Spinbox, should remain compatible)
        try:
            # Map the prefix ('sel'/'start'/'end') to its trio of IntVars and
            # the corresponding day spinbox widget on the app.
            if prefix == 'sel':
                year_var, month_var, day_var = self.app.sel_year, self.app.sel_month, self.app.sel_day
                day_spinbox = self.app.day_spinbox
            elif prefix == 'start':
                year_var, month_var, day_var = self.app.start_year, self.app.start_month, self.app.start_day
                day_spinbox = self.app.start_day_spinbox
            elif prefix == 'end':
                year_var, month_var, day_var = self.app.end_year, self.app.end_month, self.app.end_day
                day_spinbox = self.app.end_day_spinbox
            else:
                self.log_message(f"Invalid prefix '{prefix}' for date validation.", "WARN")
                return

            year = year_var.get()
            month = month_var.get()

            if 1 <= month <= 12:
                _, days_in_month = calendar.monthrange(year, month)
                # Configure spinbox only if it exists (using standard tkinter methods)
                if day_spinbox and getattr(day_spinbox, 'winfo_exists', lambda: False)(): # Safer check
                    day_spinbox.config(to=days_in_month)
                    # Clamp the day if the current value exceeds the new maximum.
                    if day_var.get() > days_in_month:
                        day_var.set(days_in_month)
        except ValueError: pass
        except (tk.TclError, AttributeError): pass
        except Exception as e:
            self.log_message(f"Error validating date spinbox ({prefix}): {e}", "ERROR")

    # --- Date Parsing Helpers ---
    def _parse_date_or_show_error(self, year_var, month_var, day_var, date_description: str) -> Optional[date]:
        """Builds a date from the three IntVars; shows a messagebox and returns None if invalid or out of range."""
        try:
            year_val, month_val, day_val = year_var.get(), month_var.get(), day_var.get()
            # Explicitly convert to integers
            parsed_date = date(int(year_val), int(month_val), int(day_val))
            if parsed_date > date.today():
                messagebox.showwarning("Invalid Date", f"The selected {date_description} date ({parsed_date.strftime('%Y-%m-%d')}) cannot be in the future.")
                return None
            if parsed_date < CUTOFF_DATE:
                messagebox.showwarning("Invalid Date", f"The selected {date_description} date ({parsed_date.strftime('%Y-%m-%d')}) must be on or after {CUTOFF_DATE.strftime('%Y-%m-%d')}.")
                return None
            return parsed_date
        except ValueError:
            messagebox.showerror("Invalid Date", f"The selected {date_description} date is invalid. Please check the year, month, and day.")
            return None

    def _get_dates_for_mode(self, mode: str) -> Optional[Tuple[Optional[date], Optional[date], Optional[date]]]:
        """Resolves (target, start, end) dates for the given mode; returns None when validation fails."""
        target_date_obj: Optional[date] = None
        start_date_obj: Optional[date] = None
        end_date_obj: Optional[date] = None

        if mode == 'today': target_date_obj = date.today()
        elif mode == 'yesterday': target_date_obj = date.today() - timedelta(days=1)
        elif mode == 'specific_date':
            target_date_obj = self._parse_date_or_show_error(self.app.sel_year, self.app.sel_month, self.app.sel_day, "specific")
            if target_date_obj is None: return None
        elif mode == 'date_range':
            start_date_obj = self._parse_date_or_show_error(self.app.start_year, self.app.start_month, self.app.start_day, "start")
            if start_date_obj is None: return None
            end_date_obj = self._parse_date_or_show_error(self.app.end_year, self.app.end_month, self.app.end_day, "end")
            if end_date_obj is None: return None
            if start_date_obj > end_date_obj:
                messagebox.showwarning("Invalid Date Range", "The 'Start Date' cannot be later than the 'End Date'.")
                return None
        elif mode == 'all': pass
        return target_date_obj, start_date_obj, end_date_obj


    # --- Scraping Control ---
    def start_scraping_base(self, mode: str):
        """Base function to validate inputs and initiate scraping for any mode."""
        if self.app.scraping_thread and self.app.scraping_thread.is_alive():
            messagebox.showwarning("Process Running", "A scraping process is already active.")
            return

        # === NEW: Validate Channel List File from Dropdown ===
        selected_filename = self.app.channellist_path.get().strip()
        if not selected_filename or selected_filename in ["No lists found", "Error reading lists", "Error scanning lists"]:
            messagebox.showwarning("Missing Input", "Please select a valid channel list from the dropdown.\nEnsure the 'channelslists' folder exists and contains .txt files.")
            return

        # Construct the full path
        channelslists_dir = Path(self.app.base_dir) / "channelslists"
        channellist_file = str(channelslists_dir / selected_filename) # Convert Path to string for os.path.exists

        if not os.path.exists(channellist_file): # Check full path existence
            messagebox.showerror("File Error", f"The selected channel list file does not seem to exist:\n{channellist_file}")
            # Consider re-populating the dropdown here if the file vanished
            # self.app._populate_channel_list_dropdown()
            return
        # === END NEW ===

        # Get and Validate Dates (No changes needed here)
        date_info = self._get_dates_for_mode(mode)
        if date_info is None: return
        target_date_obj, start_date_obj, end_date_obj = date_info

        # --- Start Scraping Thread ---
        self.app.stop_event.clear()

        # Clear log area (using CTkTextbox configure)
        if hasattr(self.app, 'log_text') and self.app.log_text:
            self.app.log_text.configure(state="normal")
            self.app.log_text.delete('1.0', ctk.END)
            self.app.log_text.configure(state="disabled")

        self.log_message(f"Initiating scraping process (Mode: '{mode}', List: '{selected_filename}')...", "INFO")
        self.disable_action_buttons()

        self.app.scraping_thread = threading.Thread(
            target=self.scrape_in_thread,
            # Pass the FULL PATH to the scrape function
            args=(channellist_file, mode, target_date_obj, start_date_obj, end_date_obj),
            daemon=True
        )
        self.app.scraping_thread.start()

    def stop_scraping(self):
        """Signals the worker thread to stop via the shared stop_event (UI-library independent)."""
        if self.app.scraping_thread and self.app.scraping_thread.is_alive():
            self.log_message("Stop signal sent to scraping thread.", "WARN")
            self.app.stop_event.set()
            if hasattr(self.app, 'stop_button') and self.app.stop_button:
                self.app.stop_button.configure(state="disabled") # Use configure for ctk
        else:
            self.log_message("No active scraping process to stop.", "INFO")

    def scrape_in_thread(self, channellist_file, mode, target_date, start_date, end_date):
        """Worker-thread entry point: runs run_scraping and schedules the result
        message + button reset back onto the Tk main thread via master.after."""
        output_files = []
        error_occurred = False
        final_message = "An unknown error occurred."
        final_message_type = "ERROR"

        try:
            output_files = run_scraping(
                channellist_file=channellist_file, # Pass the full path
                mode=mode,
                target_date=target_date,
                start_date=start_date,
                end_date=end_date,
                log_callback=self.log_message,
                stop_event=self.app.stop_event,
                base_dir=self.app.base_dir
            )

            if self.app.stop_event.is_set():
                final_message = "Scraping process was interrupted by the user."
                final_message_type = "WARN"
            elif not output_files:
                final_date_to_show = target_date if mode != 'date_range' else end_date
                start_date_for_msg = start_date if mode == 'date_range' else None
                final_message = self._generate_no_posts_message(mode, final_date_to_show, start_date_for_msg)
                final_message_type = "INFO"
            else:
                files_str = "\n".join([os.path.basename(f) for f in output_files])
                final_message = f"Scraping completed successfully.\nCreated/updated files:\n{files_str}"
                final_message_type = "SUCCESS"

        except ImportError as e:
            error_occurred = True
            final_message = f"Import Error: {e}\nCannot run scraping. Check installation and file structure."
            self.log_message(final_message, "ERROR")
        except (FileNotFoundError, ValueError, RuntimeError, NameError) as e:
            error_occurred = True
            final_message = f"Scraping failed: {e}"
        except Exception as e:
            error_occurred = True
            final_message = f"An unexpected critical error occurred: {type(e).__name__} - {e}"
            self.log_message(final_message, "ERROR")

        finally:
            # --- Schedule GUI updates back on the main thread ---
            if self.app.master: # Basic check if master exists
                self.app.master.after(0, self.show_final_message, final_message, final_message_type, error_occurred)
                self.app.master.after(0, self.reset_buttons)

    def _generate_no_posts_message(self, mode: str, target_date: Optional[date], start_date: Optional[date]) -> str:
        """Builds the user-facing 'no posts found' summary for the given mode and dates."""
        date_info = ""
        cutoff_str = f" (after {CUTOFF_DATE.strftime('%Y-%m-%d')})"
        if mode == 'date_range' and start_date and target_date: date_info = f" for range {start_date.strftime('%Y-%m-%d')} to {target_date.strftime('%Y-%m-%d')}"
        elif target_date and mode != 'all': date_info = f" for {target_date.strftime('%Y-%m-%d')}"
        elif mode == 'all': date_info = " in 'all' mode"
        return f"No posts matching the criteria were found{date_info}{cutoff_str}."

    # --- GUI Message Functions (run in main thread via master.after) ---
    def show_final_message(self, message: str, message_type: str, error_occurred: bool):
        """Shows the end-of-run result in an appropriate messagebox (main thread only)."""
        try:
            if self.app.master:
                if message_type == "SUCCESS": messagebox.showinfo("Success!", message)
                elif message_type == "INFO": messagebox.showinfo("No Results", message)
                elif message_type == "WARN": messagebox.showwarning("Interrupted", message)
                else: # ERROR
                    full_error_msg = f"{message}\n\nPlease check the logs for more details."
                    messagebox.showerror("Error", full_error_msg)
        except tk.TclError: pass # Window might have been closed

    # --- Button State Management ---
    def _set_button_state(self, button_name: str, state: str): # Use string state for ctk
        """Safely sets the state of a button widget attribute on the app instance."""
        button_widget = getattr(self.app, button_name, None)
        # Check for CTkButton, fallback to tk.Button/ttk.Button might be needed if mixing
        # Also check if Spinboxes are controlled here - they use standard tk state
        if isinstance(button_widget, (ctk.CTkButton)):
            # Check if widget is destroyed - basic check if object exists
            if button_widget:
                try:
                    button_widget.configure(state=state) # Use configure for ctk
                except Exception as e: # Catch broad exceptions
                    self.log_message(f"Could not configure button '{button_name}': {e}", "WARN")
                    pass
        elif isinstance(button_widget, (ttk.Spinbox)): # Handle spinboxes if needed
            if button_widget and getattr(button_widget, 'winfo_exists', lambda: False)():
                try:
                    # Spinbox uses standard tk state constants
                    tk_state = tk.NORMAL if state == "normal" else tk.DISABLED
                    button_widget.config(state=tk_state)
                except (tk.TclError, AttributeError): pass


    def disable_action_buttons(self):
        """Disables all action buttons and enables the stop button."""
        if not self.app.master: return
        buttons_to_disable = [
            'specific_date_button', 'range_date_button', 'today_button',
            'yesterday_button', 'all_button',
            'channel_list_dropdown' # Disable dropdown during run
            # 'browse_button' # Removed
        ]
        # Also disable spinboxes
        spinboxes_to_disable = [
            'day_spinbox', 'month_spinbox', 'year_spinbox',
            'start_day_spinbox', 'start_month_spinbox', 'start_year_spinbox',
            'end_day_spinbox', 'end_month_spinbox', 'end_year_spinbox'
        ]
        for btn_name in buttons_to_disable + spinboxes_to_disable:
            self._set_button_state(btn_name, "disabled") # Use string state "disabled"
        self._set_button_state('stop_button', "normal") # Use string state "normal"

    def reset_buttons(self):
        """Resets button states after scraping finishes or is stopped."""
        if not self.app.master: return
        buttons_to_enable = [
            'specific_date_button', 'range_date_button', 'today_button',
            'yesterday_button', 'all_button',
            'channel_list_dropdown' # Re-enable dropdown
            # 'browse_button' # Removed
        ]
        # Also enable spinboxes
        spinboxes_to_enable = [
            'day_spinbox', 'month_spinbox', 'year_spinbox',
            'start_day_spinbox', 'start_month_spinbox', 'start_year_spinbox',
            'end_day_spinbox', 'end_month_spinbox', 'end_year_spinbox'
        ]
        for btn_name in buttons_to_enable + spinboxes_to_enable:
            self._set_button_state(btn_name, "normal")
        self._set_button_state('stop_button', "disabled")


    # --- Window Closing Handler ---
    def on_closing(self):
        """Window-close handler: confirms (and signals stop) if scraping is active, then destroys the window."""
        if self.app.scraping_thread and self.app.scraping_thread.is_alive():
            if messagebox.askyesno("Confirm Exit", "Scraping is still in progress.\nDo you want to stop the process and exit?"):
                self.log_message("Exit requested during active scraping. Sending stop signal...", "WARN")
                self.app.stop_event.set()
                # Use destroy directly after a short delay
                self.app.master.after(200, self.app.master.destroy)
            else:
                return # Do not close
        else:
            self.log_message("Application closing.", "INFO")
            if self.app.master: # Check before destroying
                self.app.master.destroy()
--------------------------------------------------------------------------------
/src/gui/main_window.py:
--------------------------------------------------------------------------------
1 | import customtkinter as ctk
2 | from tkinter import messagebox
3 | import queue
4 | import threading
5 | from datetime import date
6 | from typing import Optional, List
7 | from pathlib import Path
8 | import os
9 |
10 | # Import UI component creation functions and event handlers
11 | from .ui_components import (
12 | create_file_selection_ui,
13 | create_specific_date_picker_ui,
14 | create_date_range_picker_ui,
15 | create_action_buttons_ui,
16 | create_log_ui,
17 | # Import the analysis info UI function separately
18 | create_analysis_info_ui
19 | )
20 | from .event_handlers import GuiEventHandlers
21 | from src.config import CUTOFF_DATE # Ensure CUTOFF_DATE is imported if used here, though likely not directly
22 |
class TelegramScraperGUI:
    """
    Main class for the Telegram Scraper GUI application using CustomTkinter.
    Orchestrates UI setup and event handling with a grid layout.
    """
    def __init__(self, master: ctk.CTk, base_dir: str):
        """
        Initializes the main GUI window.

        Args:
            master: The root CustomTkinter window (ctk.CTk instance).
            base_dir: The base directory path (string) for file operations.
        """
        self.master: ctk.CTk = master
        self.base_dir: str = base_dir
        self.master.title("Telegram Post Downloader v3.2 (Grid Layout)") # Updated version/title
        self.master.geometry("850x750") # Adjusted size for sidebar

        # --- Configure root window's grid ---
        # Column 0 (main content) will expand, Column 1 (sidebar) fixed width
        self.master.grid_columnconfigure(0, weight=1)
        self.master.grid_columnconfigure(1, weight=0) # Sidebar doesn't expand horizontally
        # Row 0 will contain everything and expand vertically
        self.master.grid_rowconfigure(0, weight=1)

        # --- CustomTkinter Variables ---
        self.channellist_path = ctk.StringVar() # Will store the selected *filename*

        # Date Picker Variables (initialize with today's date)
        today = date.today()
        self.sel_year = ctk.IntVar(value=today.year)
        self.sel_month = ctk.IntVar(value=today.month)
        self.sel_day = ctk.IntVar(value=today.day)
        self.start_year = ctk.IntVar(value=today.year)
        self.start_month = ctk.IntVar(value=today.month)
        self.start_day = ctk.IntVar(value=1) # Default start day to 1st
        self.end_year = ctk.IntVar(value=today.year)
        self.end_month = ctk.IntVar(value=today.month)
        self.end_day = ctk.IntVar(value=today.day) # Default end day to today

        # --- Threading and Logging ---
        # log_queue carries formatted log lines from the worker thread to the GUI.
        self.log_queue: queue.Queue[str] = queue.Queue()
        self.stop_event = threading.Event()
        self.scraping_thread: Optional[threading.Thread] = None

        # --- Initialize Event Handlers ---
        self.handlers = GuiEventHandlers(self)

        # --- Create Main Frames using grid ---
        # Main content frame on the left
        self.main_content_frame = ctk.CTkFrame(master, corner_radius=0, fg_color="transparent")
        # Place in grid cell (0,0), make it stick to all sides (nsew)
        self.main_content_frame.grid(row=0, column=0, padx=(10, 5), pady=10, sticky="nsew")
        # Configure internal row for log frame (assuming 5 main widgets packed above it)
        self.main_content_frame.grid_rowconfigure(5, weight=1) # Allow log frame (index 5 if 5 packed above) to expand

        # Sidebar frame on the right
        self.sidebar_frame = ctk.CTkFrame(master, width=200, corner_radius=0) # Keep defined width
        # Place in grid cell (0,1), make it stick vertically (ns)
        self.sidebar_frame.grid(row=0, column=1, padx=(5, 10), pady=10, sticky="ns")
        # Prevent sidebar from shrinking to content
        self.sidebar_frame.grid_propagate(False)


        # --- Create UI Sections (using pack inside their respective frames) ---
        # Widgets packed into main_content_frame
        create_file_selection_ui(self.main_content_frame, self)
        create_specific_date_picker_ui(self.main_content_frame, self)
        create_date_range_picker_ui(self.main_content_frame, self)
        create_action_buttons_ui(self.main_content_frame, self)
        create_log_ui(self.main_content_frame, self) # This creates self.log_text

        # Widgets packed into sidebar_frame
        create_analysis_info_ui(self.sidebar_frame, self)

        # --- Populate Channel List Dropdown ---
        self._populate_channel_list_dropdown()

        # --- Initialize and Start Log Processing Loop ---
        self.process_log_queue()

        # --- Initial Validation for Date Pickers ---
        self.validate_date_spinbox('sel')
        self.validate_date_spinbox('start')
        self.validate_date_spinbox('end')

        # --- Window Close Protocol ---
        self.master.protocol("WM_DELETE_WINDOW", self.on_closing)

        # Log application start
        self.log_message("Application initialized with grid layout.", "INFO")

    def _populate_channel_list_dropdown(self):
        """Finds .txt files in 'channelslists' and populates the dropdown."""
        channelslists_dir = Path(self.base_dir) / "channelslists"
        channel_files: List[str] = []
        default_selection = "No lists found"

        # Ensure dropdown widget exists before trying to configure it
        if not hasattr(self, 'channel_list_dropdown') or not self.channel_list_dropdown:
            self.log_message("Channel list dropdown widget not yet created.", "ERROR")
            self.channellist_path.set(default_selection) # Set variable anyway
            return

        if channelslists_dir.is_dir():
            try:
                # Get only filenames, filter for .txt, sort alphabetically
                channel_files = sorted([
                    f.name for f in channelslists_dir.glob("*.txt") if f.is_file()
                ])
                if channel_files:
                    default_selection = channel_files[0] # Default to the first file found
                    # Configure the dropdown
                    self.channel_list_dropdown.configure(values=channel_files, state="readonly") # Use readonly state
                    self.log_message(f"Found channel lists: {', '.join(channel_files)}", "DEBUG")
                else:
                    self.log_message(f"No .txt files found in {channelslists_dir}", "WARN")
                    self.channel_list_dropdown.configure(values=[default_selection], state="disabled")

            except OSError as e:
                self.log_message(f"Error reading channel list directory {channelslists_dir}: {e}", "ERROR")
                self.channel_list_dropdown.configure(values=[f"Error reading lists"], state="disabled")
                default_selection = "Error reading lists"
            except Exception as e:
                self.log_message(f"Unexpected error scanning for channel lists: {e}", "ERROR")
                self.channel_list_dropdown.configure(values=[f"Error scanning lists"], state="disabled")
                default_selection = "Error scanning lists"
        else:
            self.log_message(f"Channel list directory not found: {channelslists_dir}", "WARN")
            self.channel_list_dropdown.configure(values=[default_selection], state="disabled")

        # Set the variable for the dropdown
        self.channellist_path.set(default_selection)


    # --- Method Delegation to Handlers ---
    # These methods provide a clean interface and delegate the actual work
    # to the GuiEventHandlers instance.

    def open_file_dialog(self): # Deprecated method
        """Delegates to the (deprecated) file-dialog handler; the dropdown is used instead."""
        self.handlers.open_file_dialog()

    def validate_date_spinbox(self, prefix: str):
        """Delegates day-of-month clamping for the given picker prefix to the handler."""
        self.handlers.validate_date_spinbox(prefix)

    def log_message(self, message: str, level: str = "INFO"):
        """Logs a message via the handler (which queues it)."""
        self.handlers.log_message(message, level)

    def process_log_queue(self):
        """Starts or continues processing the log queue via the handler."""
        self.handlers.process_log_queue()

    def start_scraping_base(self, mode: str):
        """Initiates scraping via the handler."""
        self.handlers.start_scraping_base(mode)

    def stop_scraping(self):
        """Stops scraping via the handler."""
        self.handlers.stop_scraping()

    def disable_action_buttons(self):
        """Disables buttons during scraping via the handler."""
        self.handlers.disable_action_buttons()

    def reset_buttons(self):
        """Resets button states via the handler."""
        self.handlers.reset_buttons()

    def on_closing(self):
        """Handles window closing via the handler."""
        self.handlers.on_closing()
195 |
196 | # Note: The actual Tkinter mainloop is called in getTelegram.py, not here.
--------------------------------------------------------------------------------
/src/gui/ui_components.py:
--------------------------------------------------------------------------------
1 | import customtkinter as ctk
2 | from tkinter import ttk
3 | from datetime import date
4 | import os
5 | import webbrowser # Import webbrowser for opening links
6 |
7 | # Import config only for CUTOFF_DATE display/limits
8 | from src.config import CUTOFF_DATE
9 |
10 | # Standard padding values
11 | PAD_X = 10
12 | PAD_Y = 5
13 | INNER_PAD_X = 5
14 | INNER_PAD_Y = 5
15 |
16 | # --- File Selection, Date Pickers, Action Buttons ---
17 | # (Keep create_file_selection_ui, create_specific_date_picker_ui,
18 | # create_date_range_picker_ui, create_action_buttons_ui as they were
19 | # in the customtkinter version from the previous steps)
20 | # Example placeholder for one function:
def create_file_selection_ui(master_frame: ctk.CTk, app_instance):
    """Builds the channel-list picker section (a read-only combo box).

    The combo box is stored on app_instance.channel_list_dropdown and bound
    to app_instance.channellist_path. Returns the containing frame.
    """
    frame = ctk.CTkFrame(master_frame)
    frame.pack(padx=PAD_X, pady=(PAD_Y * 2, PAD_Y), fill="x", anchor="n")

    header = ctk.CTkLabel(frame, text="1. Select Channel List", font=ctk.CTkFont(weight="bold"))
    header.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0))

    # Read-only: the user picks from discovered lists, no free text entry.
    dropdown = ctk.CTkComboBox(frame, variable=app_instance.channellist_path, state="readonly", width=250)
    app_instance.channel_list_dropdown = dropdown
    dropdown.pack(pady=INNER_PAD_Y, padx=INNER_PAD_X)

    hint = ctk.CTkLabel(frame, text="Select a list. Lists are loaded from 'channelslists' folder.", font=ctk.CTkFont(size=10), text_color="gray")
    hint.pack(pady=(0, INNER_PAD_Y), padx=INNER_PAD_X)
    return frame
34 |
35 | # --- (Include the other create_*_ui functions here from previous steps) ---
def create_specific_date_picker_ui(master_frame: ctk.CTk, app_instance):
    """Creates the UI section for downloading posts from one specific date.

    Builds day/month/year ttk.Spinbox widgets (stored on app_instance as
    day_spinbox / month_spinbox / year_spinbox, bound to sel_day / sel_month /
    sel_year) plus a button that starts scraping in 'specific_date' mode.
    Returns the containing frame.
    """
    date_frame = ctk.CTkFrame(master_frame)
    date_frame.pack(padx=PAD_X, pady=PAD_Y, fill="x", anchor="n")
    section_label = ctk.CTkLabel(date_frame, text="2a. Download for Specific Date", font=ctk.CTkFont(weight="bold"))
    section_label.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0))
    date_picker_inner_frame = ctk.CTkFrame(date_frame, fg_color="transparent")
    date_picker_inner_frame.pack(pady=INNER_PAD_Y, fill="x", padx=INNER_PAD_X)
    date_spin_frame = ctk.CTkFrame(date_picker_inner_frame, fg_color="transparent")
    date_spin_frame.pack(side="left", padx=(0, PAD_X))
    # Year range is limited to CUTOFF_DATE.year .. current year.
    current_year = date.today().year; min_year = CUTOFF_DATE.year
    ctk.CTkLabel(date_spin_frame, text="Day:", width=30).pack(side="left", padx=(0, 2))
    # Each spinbox re-validates the whole 'sel' group on change.
    app_instance.day_spinbox = ttk.Spinbox(date_spin_frame, from_=1, to=31, textvariable=app_instance.sel_day, width=4, command=lambda: app_instance.validate_date_spinbox('sel'))
    app_instance.day_spinbox.pack(side="left", padx=(0, 8))
    ctk.CTkLabel(date_spin_frame, text="Month:", width=40).pack(side="left", padx=(0, 2))
    app_instance.month_spinbox = ttk.Spinbox( date_spin_frame, from_=1, to=12, textvariable=app_instance.sel_month, width=4, command=lambda: app_instance.validate_date_spinbox('sel'))
    app_instance.month_spinbox.pack(side="left", padx=(0, 8))
    ctk.CTkLabel(date_spin_frame, text="Year:", width=35).pack(side="left", padx=(0, 2))
    app_instance.year_spinbox = ttk.Spinbox( date_spin_frame, from_=min_year, to=current_year, textvariable=app_instance.sel_year, width=6, command=lambda: app_instance.validate_date_spinbox('sel'))
    app_instance.year_spinbox.pack(side="left")
    # Starts scraping for the single selected date.
    app_instance.specific_date_button = ctk.CTkButton(date_picker_inner_frame, text="Download This Date", command=lambda: app_instance.start_scraping_base('specific_date'), width=160)
    app_instance.specific_date_button.pack(side="left", padx=(PAD_X, 0))
    return date_frame
59 |
def create_date_range_picker_ui(master_frame: ctk.CTk, app_instance):
    """Creates the UI section for downloading posts across a start/end date range.

    Builds two rows of D/M/Y ttk.Spinbox widgets (stored on app_instance as
    start_*_spinbox / end_*_spinbox, bound to start_* / end_* variables) and a
    button that starts scraping in 'date_range' mode. Returns the containing
    frame.
    """
    range_frame = ctk.CTkFrame(master_frame)
    range_frame.pack(padx=PAD_X, pady=PAD_Y, fill="x", anchor="n")
    section_label = ctk.CTkLabel(range_frame, text="2b. Download Date Range", font=ctk.CTkFont(weight="bold"))
    section_label.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0))
    # Year spinboxes are limited to CUTOFF_DATE.year .. current year.
    current_year = date.today().year; min_year = CUTOFF_DATE.year; label_width = 70
    # --- Start date row ---
    start_frame = ctk.CTkFrame(range_frame, fg_color="transparent")
    start_frame.pack(pady=(INNER_PAD_Y, 2), fill="x", padx=INNER_PAD_X)
    ctk.CTkLabel(start_frame, text="Start Date:", width=label_width, anchor='w').pack(side="left", padx=(0, INNER_PAD_X))
    start_spin_frame = ctk.CTkFrame(start_frame, fg_color="transparent")
    start_spin_frame.pack(side="left")
    ctk.CTkLabel(start_spin_frame, text="D:", width=15).pack(side="left", padx=(0, 1))
    app_instance.start_day_spinbox = ttk.Spinbox(start_spin_frame, from_=1, to=31, width=4, textvariable=app_instance.start_day, command=lambda: app_instance.validate_date_spinbox('start'))
    app_instance.start_day_spinbox.pack(side="left", padx=(0, 5))
    ctk.CTkLabel(start_spin_frame, text="M:", width=15).pack(side="left", padx=(0, 1))
    app_instance.start_month_spinbox = ttk.Spinbox(start_spin_frame, from_=1, to=12, width=4, textvariable=app_instance.start_month, command=lambda: app_instance.validate_date_spinbox('start'))
    app_instance.start_month_spinbox.pack(side="left", padx=(0, 5))
    ctk.CTkLabel(start_spin_frame, text="Y:", width=15).pack(side="left", padx=(0, 1))
    app_instance.start_year_spinbox = ttk.Spinbox(start_spin_frame, from_=min_year, to=current_year, width=6, textvariable=app_instance.start_year, command=lambda: app_instance.validate_date_spinbox('start'))
    app_instance.start_year_spinbox.pack(side="left")
    # --- End date row ---
    end_frame = ctk.CTkFrame(range_frame, fg_color="transparent")
    end_frame.pack(pady=2, fill="x", padx=INNER_PAD_X)
    ctk.CTkLabel(end_frame, text="End Date:", width=label_width, anchor='w').pack(side="left", padx=(0, INNER_PAD_X))
    end_spin_frame = ctk.CTkFrame(end_frame, fg_color="transparent")
    end_spin_frame.pack(side="left")
    ctk.CTkLabel(end_spin_frame, text="D:", width=15).pack(side="left", padx=(0, 1))
    app_instance.end_day_spinbox = ttk.Spinbox(end_spin_frame, from_=1, to=31, width=4, textvariable=app_instance.end_day, command=lambda: app_instance.validate_date_spinbox('end'))
    app_instance.end_day_spinbox.pack(side="left", padx=(0, 5))
    ctk.CTkLabel(end_spin_frame, text="M:", width=15).pack(side="left", padx=(0, 1))
    app_instance.end_month_spinbox = ttk.Spinbox(end_spin_frame, from_=1, to=12, width=4, textvariable=app_instance.end_month, command=lambda: app_instance.validate_date_spinbox('end'))
    app_instance.end_month_spinbox.pack(side="left", padx=(0, 5))
    ctk.CTkLabel(end_spin_frame, text="Y:", width=15).pack(side="left", padx=(0, 1))
    app_instance.end_year_spinbox = ttk.Spinbox(end_spin_frame, from_=min_year, to=current_year, width=6, textvariable=app_instance.end_year, command=lambda: app_instance.validate_date_spinbox('end'))
    app_instance.end_year_spinbox.pack(side="left")
    # --- Action button + data-availability note ---
    button_frame = ctk.CTkFrame(range_frame, fg_color="transparent")
    button_frame.pack(pady=(INNER_PAD_Y * 2, INNER_PAD_Y))
    app_instance.range_date_button = ctk.CTkButton(button_frame, text="Download Date Range", command=lambda: app_instance.start_scraping_base('date_range'), width=180)
    app_instance.range_date_button.pack()
    min_date_str = CUTOFF_DATE.strftime('%Y-%m-%d')
    ctk.CTkLabel(range_frame, text=f"Note: Data is available from {min_date_str} onwards.", text_color="gray").pack(pady=(0, INNER_PAD_Y), anchor='center')
    return range_frame
102 |
def create_action_buttons_ui(master_frame: ctk.CTk, app_instance):
    """Builds the quick-action row (today / yesterday / all) plus the stop button.

    Buttons are stored on app_instance (today_button, yesterday_button,
    all_button, stop_button). Returns the containing frame.
    """
    frame = ctk.CTkFrame(master_frame)
    frame.pack(padx=PAD_X, pady=PAD_Y, fill="x", anchor="n")

    header = ctk.CTkLabel(frame, text="2c. Quick Actions / All / Stop", font=ctk.CTkFont(weight="bold"))
    header.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0))

    row = ctk.CTkFrame(frame, fg_color="transparent")
    row.pack(pady=INNER_PAD_Y)

    app_instance.today_button = ctk.CTkButton(
        row, text="Download Today",
        command=lambda: app_instance.start_scraping_base('today'), width=150
    )
    app_instance.today_button.pack(side="left", padx=INNER_PAD_X)

    app_instance.yesterday_button = ctk.CTkButton(
        row, text="Download Yesterday",
        command=lambda: app_instance.start_scraping_base('yesterday'), width=150
    )
    app_instance.yesterday_button.pack(side="left", padx=INNER_PAD_X)

    app_instance.all_button = ctk.CTkButton(
        row, text=f"Download All (since {CUTOFF_DATE.year})",
        command=lambda: app_instance.start_scraping_base('all'), width=180
    )
    app_instance.all_button.pack(side="left", padx=INNER_PAD_X)

    # Prominent red stop button; starts disabled and is enabled while scraping.
    app_instance.stop_button = ctk.CTkButton(
        frame, text="STOP SCRAPING", command=app_instance.stop_scraping,
        state="disabled", width=200, fg_color="#D32F2F", hover_color="#B71C1C",
        text_color="white", font=ctk.CTkFont(weight="bold")
    )
    app_instance.stop_button.pack(pady=(INNER_PAD_Y, INNER_PAD_Y*2))

    ctk.CTkLabel(frame, text="Warning: 'Download All' can take long & create many files!", text_color="#FF8C00").pack(pady=(0,INNER_PAD_Y))
    return frame
122 | # --- End Placeholder ---
123 |
124 |
def create_log_ui(master_frame: ctk.CTk, app_instance):
    """Builds the scrollable, read-only log textbox with per-level color tags.

    The textbox is stored on app_instance.log_text. Returns the containing frame.
    """
    frame = ctk.CTkFrame(master_frame)
    # Expand to fill the remaining vertical space of the content area.
    frame.pack(padx=PAD_X, pady=PAD_Y, fill="both", expand=True)

    header = ctk.CTkLabel(frame, text="Logs", font=ctk.CTkFont(weight="bold"))
    header.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0))

    app_instance.log_text = ctk.CTkTextbox(frame, wrap="word", height=150, state="disabled")
    app_instance.log_text.pack(fill="both", expand=True, padx=INNER_PAD_X, pady=INNER_PAD_Y)

    # One foreground color per log level tag.
    level_colors = {"ERROR": "#FF0000", "WARN": "#FFA500", "INFO": "#007ACC", "DEBUG": "#808080"}
    for level, color in level_colors.items():
        app_instance.log_text.tag_config(level, foreground=color)

    return frame
147 |
148 | # --- NEW/MODIFIED Analysis Info UI for Sidebar ---
def create_analysis_info_ui(master_frame: ctk.CTk, app_instance):
    """Creates the informational section for the sidebar with a clickable link.

    Args:
        master_frame: The sidebar frame passed from main_window; labels are
            packed directly into it.
        app_instance: The main application instance (unused here; kept so all
            create_*_ui builders share the same signature).

    Returns:
        The master_frame itself (for symmetry with the other builders).
    """
    master_frame.configure(fg_color="transparent")  # Make sidebar background transparent if desired

    section_label = ctk.CTkLabel(master_frame, text="3. Data Analysis Tip", font=ctk.CTkFont(weight="bold"))
    section_label.pack(pady=(5, 5), padx=INNER_PAD_X, anchor='w')  # Pack directly into sidebar frame

    # --- Text and Link Handling ---
    text_part1 = "After downloading, analyze the 'output_*.txt' files using RAG tools.\nA recommended tool is Google's NotebookLM:"
    url = "https://notebooklm.google.com/"
    text_part2 = "\nUpload the files there to ask questions about the content. Feel free to explore other tools."

    # Wrap length appropriate for the sidebar width (adjust if needed)
    sidebar_wrap_length = 180

    # Text before the link
    label_part1 = ctk.CTkLabel(
        master_frame,
        text=text_part1,
        justify=ctk.LEFT,
        anchor='w',
        wraplength=sidebar_wrap_length
    )
    label_part1.pack(pady=(0, 2), padx=INNER_PAD_X, fill='x')

    # Clickable link label
    link_label = ctk.CTkLabel(
        master_frame,
        text=url,
        text_color="cornflowerblue",  # Standard link color
        cursor="hand2",  # Change cursor on hover
        justify=ctk.LEFT,
        anchor='w',
        wraplength=sidebar_wrap_length
    )
    link_label.pack(pady=2, padx=INNER_PAD_X, fill='x')
    # BUG FIX: the event sequence was empty (""), so the label never reacted
    # to clicks. Bind left mouse button press to open the URL in the browser.
    link_label.bind("<Button-1>", lambda event: webbrowser.open_new(url))
    # Optional: Add underline
    # link_font = ctk.CTkFont(underline=True)
    # link_label.configure(font=link_font)

    # Text after the link
    label_part2 = ctk.CTkLabel(
        master_frame,
        text=text_part2,
        justify=ctk.LEFT,
        anchor='w',
        wraplength=sidebar_wrap_length
    )
    label_part2.pack(pady=(2, 5), padx=INNER_PAD_X, fill='x')

    return master_frame
--------------------------------------------------------------------------------
/src/scraper/__init__.py:
--------------------------------------------------------------------------------
1 | # src/scraper/__init__.py
2 | # This file makes 'scraper' a Python package.
3 | from .core_logic import run_scraping
--------------------------------------------------------------------------------
/src/scraper/core_logic.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re # Import re
3 | import threading
4 | import time
5 | from datetime import date, datetime
6 | from pathlib import Path # Use pathlib for path operations
7 | from typing import List, Dict, Optional, TextIO, Tuple, Callable, Any # Added Any
8 |
9 | # Import client and models from the sibling package
10 | try:
11 | from my_telegram_scrapper import SimpleScraperClient, SimpleTgPost, ScrapedPage
12 | except ImportError as e:
13 | # This error should ideally be caught at the application entry point,
14 | # but raise it here too for clarity if this module is used independently.
15 | raise ImportError("Could not import from 'my_telegram_scrapper'. Is it installed or in PYTHONPATH?") from e
16 |
17 | # Import configuration and utilities
18 | from src.config import CUTOFF_DATE
19 | from src.utils.file_utils import archive_old_output_files, load_channels
20 |
21 | # --- Helper Functions ---
22 |
def _determine_date_range(
    mode: str, target_date: Optional[date], start_date: Optional[date], end_date: Optional[date]
) -> Tuple[date, date, str]:
    """
    Resolve the effective scraping window for `mode`.

    Single-date modes collapse the window to `target_date`; 'date_range' clamps
    the start to CUTOFF_DATE; 'all' (and any unrecognized mode) falls back to
    the full CUTOFF_DATE..today window.

    Returns:
        (effective_start_date, effective_end_date, log_date_info_string).
    Raises:
        ValueError if required dates for a mode are missing, or if the
        resolved end date precedes the resolved start date.
    """
    # Fall-back window: everything from the hard cutoff up to today.
    span_start = CUTOFF_DATE
    span_end = date.today()
    date_info = ""

    if mode in ('today', 'yesterday', 'specific_date'):
        if target_date is None:
            raise ValueError(f"Target date is required for mode '{mode}'.")
        # Single-day window.
        span_start = span_end = target_date
        date_info = f" for date {target_date.strftime('%Y-%m-%d')}"
    elif mode == 'date_range':
        if start_date is None or end_date is None:
            raise ValueError("Start and end dates are required for 'date_range' mode.")
        # Never scrape earlier than the absolute cutoff.
        span_start = max(start_date, CUTOFF_DATE)
        span_end = end_date
        date_info = f" for range {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
        if start_date < CUTOFF_DATE:
            # Make the clamping visible in the log line.
            date_info += f" (effective start: {span_start.strftime('%Y-%m-%d')})"
    elif mode == 'all':
        date_info = f" (since {CUTOFF_DATE.strftime('%Y-%m-%d')})"

    # Cutoff clamping can invert a user-supplied range; reject that here.
    if span_end < span_start:
        raise ValueError(f"Effective end date ({span_end.strftime('%Y-%m-%d')}) cannot be before effective start date ({span_start.strftime('%Y-%m-%d')}).")

    return span_start, span_end, date_info
67 |
68 |
69 | def _write_post_to_file(handle: TextIO, channel: str, post: SimpleTgPost):
70 | """Formats and writes a single post to an open file handle."""
71 | try:
72 | post_content = post.content or "[No text content]"
73 | # Clean potential multiple newlines or excessive whitespace
74 | post_content = re.sub(r'\s{2,}', ' ', post_content).strip() # Use re.sub
75 |
76 | post_url_str = post.post_url or "[No URL]"
77 | post_time_str = post.timestamp.strftime('%H:%M:%S') if post.timestamp else "[No Time]"
78 | # Format: ChannelName | URL (HH:MM:SS) : Content
79 | post_info = f"{channel} | {post_url_str} ({post_time_str}) : {post_content}\n"
80 | handle.write(post_info)
81 | except Exception as e:
82 | # Log error but don't stop the whole process for one write failure
83 | # Ideally use log_callback if available, else print
84 | print(f"Error writing post {post.post_url} for channel {channel}: {e}")
85 |
86 | def _get_output_file_handle(
87 | post_date_str: str, output_dir: Path, base_list_name: str,
88 | open_files: Dict[str, TextIO], output_files_created: List[Path], log_callback: Callable
89 | ) -> Optional[TextIO]:
90 | """Gets or creates the file handle for a specific date."""
91 | if post_date_str in open_files:
92 | return open_files[post_date_str]
93 |
94 | file_path = output_dir / f"output_{base_list_name}_{post_date_str}.txt"
95 | try:
96 | # Use 'a' mode (append)
97 | handle = open(file_path, "a", encoding="utf-8")
98 | # Write header only if the file is newly created (or empty)
99 | if file_path.stat().st_size == 0:
100 | handle.write(f"### Posts from {post_date_str} (List: {base_list_name})\n\n")
101 | open_files[post_date_str] = handle
102 | # Track the file path if newly opened/created
103 | if file_path not in output_files_created:
104 | output_files_created.append(file_path)
105 | log_callback(f"Opened output file: {file_path.name}", "DEBUG") # Debug level log
106 | return handle
107 | except OSError as e:
108 | log_callback(f"Failed to open/write header to output file {file_path}: {e}", "ERROR")
109 | return None
110 |
111 |
def _process_scraped_post(
    post: SimpleTgPost, channel: str, mode: str,
    effective_start_date: date, effective_end_date: date,
    output_dir: Path, base_list_name: str,
    open_files: Dict[str, TextIO], output_files_created: List[Path],
    all_posts_for_specific_date: List[Tuple[str, SimpleTgPost]],
    log_callback: Callable
) -> bool:
    """
    Filter one post against the mode's date criteria and route it to output.

    In 'all'/'date_range' mode a matching post is written straight into the
    per-date file; in single-date modes it is buffered in
    `all_posts_for_specific_date` for later sorted writing.

    Returns:
        True if the post was written/buffered, False otherwise.
    """
    # A post without a timestamp cannot be date-filtered at all.
    if not post.timestamp:
        return False

    post_day = post.timestamp.date()

    # Nothing older than the absolute cutoff is ever kept, regardless of mode.
    if post_day < CUTOFF_DATE:
        return False

    # Mode-specific date filter.
    if mode == 'all':
        matches = True
    elif mode == 'date_range':
        matches = effective_start_date <= post_day <= effective_end_date
    elif mode in ('today', 'yesterday', 'specific_date'):
        # Single-date modes: start == end == target date.
        matches = post_day == effective_start_date
    else:
        matches = False
    if not matches:
        return False

    if mode in ('all', 'date_range'):
        # Stream directly into the file for the post's own date.
        day_key = post_day.strftime("%Y-%m-%d")
        handle = _get_output_file_handle(
            day_key, output_dir, base_list_name,
            open_files, output_files_created, log_callback
        )
        if handle is None:
            # Open failure was already logged by _get_output_file_handle.
            return False
        _write_post_to_file(handle, channel, post)
        return True

    # Single-date modes: buffer (channel, post) so output can be sorted later.
    all_posts_for_specific_date.append((channel, post))
    return True
169 |
170 |
def _scrape_single_channel(
    client: SimpleScraperClient, channel: str, mode: str,
    effective_start_date: date, effective_end_date: date,
    log_callback: Callable, stop_event: threading.Event,
    output_dir: Path, base_list_name: str,
    open_files: Dict[str, TextIO], output_files_created: List[Path],
    all_posts_for_specific_date: List[Tuple[str, SimpleTgPost]]
) -> int:
    """
    Scrapes posts for a single channel, handling pagination and date filtering.

    Pagination stops when any of the following occurs: the stop_event is set,
    a page fetch fails, a page has no posts, there is no next-page token, the
    oldest post on a page predates effective_start_date, or a 500-page safety
    limit is reached.

    Args:
        client: Open scraper client used to fetch channel pages.
        channel: Channel name/identifier to scrape.
        mode: Scraping mode (forwarded to the per-post date filter).
        effective_start_date: Inclusive lower bound for post dates.
        effective_end_date: Inclusive upper bound for post dates.
        log_callback: Callable(message, level) used for progress logging.
        stop_event: Cooperative cancellation flag, checked before each page.
        output_dir: Directory where per-date output files are written.
        base_list_name: Channel-list stem used in output file names.
        open_files: Shared cache of open per-date file handles.
        output_files_created: Shared list of files created/updated so far.
        all_posts_for_specific_date: Shared buffer for single-date modes.

    Returns:
        The number of posts successfully processed for this channel matching criteria.
    """
    next_page_token: Optional[str] = None
    pages_checked = 0
    processed_posts_count = 0  # Posts processed *for this channel* matching criteria
    stop_channel_pagination = False
    last_oldest_date_on_page: Optional[date] = None  # Track oldest date seen

    log_callback(f"Starting channel: {channel}", "DEBUG")

    while not stop_channel_pagination:
        if stop_event.is_set():
            log_callback(f"Stop signal received, interrupting channel {channel}.", "WARN")
            break  # Break inner loop (pagination)

        pages_checked += 1
        log_callback(f" Fetching page {pages_checked} for {channel} (Token: {next_page_token or 'None'})...", "DEBUG")

        # --- Fetch Page ---
        try:
            page_data = client.get_channel_page(channel, before_token=next_page_token)
        except Exception as fetch_e:  # Catch errors during fetch/parse at client level
            log_callback(f"Error fetching/parsing page {pages_checked} for {channel}: {fetch_e}", "ERROR")
            stop_channel_pagination = True  # Stop processing this channel on error
            continue  # Skip to next channel or finish

        if not page_data or not page_data.posts:
            log_callback(f" No more posts found or page error for {channel} on page {pages_checked}.", "INFO")
            stop_channel_pagination = True
            continue

        # --- Process Posts on Page ---
        posts_on_page = page_data.posts
        oldest_post_date_this_page: Optional[date] = None
        posts_processed_this_page = 0

        for post in posts_on_page:
            if post.timestamp:
                current_post_date = post.timestamp.date()
                # Update oldest date seen on this specific page
                if oldest_post_date_this_page is None or current_post_date < oldest_post_date_this_page:
                    oldest_post_date_this_page = current_post_date

            # Process the post (checks dates, writes/collects); timestamp-less
            # posts are rejected inside _process_scraped_post.
            if _process_scraped_post(
                post, channel, mode, effective_start_date, effective_end_date,
                output_dir, base_list_name, open_files, output_files_created,
                all_posts_for_specific_date, log_callback
            ):
                posts_processed_this_page += 1

        if posts_processed_this_page > 0:
            log_callback(f" Processed {posts_processed_this_page} matching posts from page {pages_checked}.", "DEBUG")
        # Log if posts existed but none matched criteria for this specific page
        elif posts_on_page:
            log_callback(f" No posts on page {pages_checked} matched date criteria for mode '{mode}'.", "DEBUG")

        processed_posts_count += posts_processed_this_page
        last_oldest_date_on_page = oldest_post_date_this_page  # Store for pagination logic

        # --- Pagination Stop Conditions ---
        next_page_token = page_data.next_page_token
        if not next_page_token:
            log_callback(f" End of channel history reached for {channel} (no next page token).", "INFO")
            stop_channel_pagination = True
            continue

        # Stop if the oldest post found on the page is before the required start date
        if last_oldest_date_on_page:
            if last_oldest_date_on_page < effective_start_date:
                log_callback(f" Oldest post on page ({last_oldest_date_on_page.strftime('%Y-%m-%d')}) is before target start date ({effective_start_date.strftime('%Y-%m-%d')}). Stopping pagination for {channel}.", "INFO")
                stop_channel_pagination = True
                continue
        # Safety break so a malformed channel can never paginate forever.
        if pages_checked > 500:  # Arbitrary limit to prevent infinite loops on weird pages
            log_callback(f"Warning: Exceeded 500 pages for channel {channel}. Stopping pagination.", "WARN")
            stop_channel_pagination = True
            continue

        # Optional: Short delay between page requests
        # time.sleep(0.1) # Be mindful of rate limiting

    log_callback(f"Finished channel {channel}. Found {processed_posts_count} matching posts.", "INFO")
    return processed_posts_count
267 |
268 | # --- Main Scraping Function ---
def scrape_channels(
    channellist_file: str, mode: str,
    target_date: Optional[date], start_date: Optional[date], end_date: Optional[date],
    log_callback: Callable, stop_event: threading.Event,
    output_dir: Path
) -> List[Path]:
    """
    Scrapes posts from channels listed in a file based on mode and date criteria.

    Args:
        channellist_file: Path to the file containing channel names/URLs.
        mode: Scraping mode ('today', 'yesterday', 'specific_date', 'date_range', 'all').
        target_date: The specific date for single-date modes.
        start_date: Start date for range mode.
        end_date: End date for range mode.
        log_callback: Function to call for logging messages to the GUI/console.
        stop_event: Threading event to signal stopping the process.
        output_dir: Path object for the directory to save output files.

    Returns:
        A list of Path objects for the output files created or updated.
    Raises:
        ValueError, FileNotFoundError, RuntimeError on critical errors.

    Notes:
        - 'all'/'date_range' modes append to one file per post date as posts
          stream in; single-date modes buffer all posts, sort them by
          timestamp, and overwrite a single file at the end.
        - All per-date file handles are closed in the finally block even if
          scraping aborts mid-run.
    """
    output_files_created: List[Path] = []
    # Get the base name of the channel list file (e.g., "proRuChannels")
    base_list_name = Path(channellist_file).stem

    try:
        effective_start_date, effective_end_date, log_date_info = _determine_date_range(
            mode, target_date, start_date, end_date
        )
    except ValueError as e:
        log_callback(f"Date range error: {e}", "ERROR")
        raise e  # Re-raise for the caller (GUI thread)

    log_callback(f"Starting scraping process. Mode: '{mode}'{log_date_info}", "INFO")
    log_callback(f"Effective date range: {effective_start_date.strftime('%Y-%m-%d')} to {effective_end_date.strftime('%Y-%m-%d')}", "DEBUG")

    # Load channels (handles its own file errors)
    channels = load_channels(channellist_file, log_callback)
    if not channels:  # load_channels should raise error if file empty/not found, but double-check
        log_callback("Channel list is empty or could not be loaded.", "ERROR")
        raise ValueError("Channel list is empty.")  # Raise error to stop process

    # Dictionary to hold open file handles {date_str: file_handle} for range/all modes
    open_files: Dict[str, TextIO] = {}
    # List to store posts for single-date modes before writing
    all_posts_for_specific_date: List[Tuple[str, SimpleTgPost]] = []
    total_processed_posts = 0

    try:
        # Use the client as a context manager
        with SimpleScraperClient() as client:
            log_callback(f"Processing {len(channels)} channels from {Path(channellist_file).name}...", "INFO")
            for i, channel in enumerate(channels):
                if stop_event.is_set():
                    log_callback("Stop signal received. Aborting channel processing.", "WARN")
                    break  # Break outer loop (channel iteration)

                log_callback(f"--- Channel {i+1}/{len(channels)}: {channel} ---", "INFO")

                # Scrape the current channel; shared state (open_files, buffers)
                # is mutated in place by the helper.
                processed_count = _scrape_single_channel(
                    client, channel, mode, effective_start_date, effective_end_date,
                    log_callback, stop_event, output_dir, base_list_name,
                    open_files, output_files_created, all_posts_for_specific_date
                )
                total_processed_posts += processed_count

    except Exception as client_error:
        # Catch unexpected errors during client usage or scraping loop
        log_callback(f"Critical error during scraping: {client_error}", "ERROR")
        # Raise a runtime error to signal failure to the calling thread
        raise RuntimeError(f"Scraping failed due to an unexpected error: {client_error}") from client_error
    finally:
        # --- Cleanup: Close all files opened in range/all mode ---
        if open_files:
            log_callback(f"Closing {len(open_files)} output files...", "INFO")
            closed_count = 0
            for date_str, handle in open_files.items():
                try:
                    if handle and not handle.closed:
                        handle.close()
                        closed_count += 1
                except Exception as close_e:
                    log_callback(f"Error closing file for date {date_str}: {close_e}", "ERROR")
            log_callback(f"Closed {closed_count} files.", "DEBUG")

    # --- Write collected posts for single-date modes ---
    if mode in ['today', 'yesterday', 'specific_date'] and all_posts_for_specific_date:
        if target_date is None:
            # This shouldn't happen if date validation passed, but check defensively
            log_callback("Cannot write single-date file: Target date is missing.", "ERROR")
        else:
            output_file_path = output_dir / f"output_{base_list_name}_{target_date.strftime('%Y-%m-%d')}.txt"
            log_callback(f"Writing {len(all_posts_for_specific_date)} collected posts to {output_file_path.name}...", "INFO")
            try:
                # Sort posts by timestamp before writing for chronological order
                all_posts_for_specific_date.sort(
                    key=lambda item: item[1].timestamp or datetime.min  # Sort by post timestamp
                )
                # Use 'w' mode (write/overwrite) for single-date files
                with open(output_file_path, "w", encoding="utf-8") as outfile:
                    outfile.write(f"### Posts from {target_date.strftime('%Y-%m-%d')} (List: {base_list_name})\n\n")
                    for channel_name, post in all_posts_for_specific_date:
                        _write_post_to_file(outfile, channel_name, post)

                # Add the file path to the list of created files if not already present
                if output_file_path not in output_files_created:
                    output_files_created.append(output_file_path)
                log_callback(f"Successfully wrote single-date file: {output_file_path.name}", "INFO")
            except OSError as write_e:
                log_callback(f"Failed to write output file {output_file_path.name}: {write_e}", "ERROR")
                # Optionally remove from created list if write failed partway?
                if output_file_path in output_files_created:
                    output_files_created.remove(output_file_path)

    # --- Final Logging ---
    total_files = len(output_files_created)
    if stop_event.is_set():
        log_callback(f"Scraping interrupted. Processed {total_processed_posts} posts into {total_files} files before stopping.", "WARN")
    elif total_processed_posts == 0:  # Check if *any* posts matching criteria were found across all channels
        log_callback(f"Scraping finished. No posts found matching the specified criteria{log_date_info}.", "INFO")
    else:
        log_callback(f"Scraping finished successfully. Processed {total_processed_posts} posts into {total_files} files.", "INFO")

    return output_files_created
397 |
398 |
# --- Runner Function (called by the GUI thread) ---
def run_scraping(
    channellist_file: str, mode: str,
    target_date: Optional[date], start_date: Optional[date], end_date: Optional[date],
    log_callback: Callable[[str, str], None], stop_event: threading.Event, base_dir: str
) -> List[str]:
    """
    Entry point called by the GUI thread: archives old output files, then runs
    the main scraping logic and returns the resulting output file paths.

    Args:
        channellist_file: Path to the text file listing channels to scrape.
        mode: Scraping mode ('today', 'yesterday', 'specific_date', or a
            range/all mode — interpreted by scrape_channels).
        target_date: Single date for single-date modes (None otherwise).
        start_date: Range start for range mode (None otherwise).
        end_date: Range end for range mode (None otherwise).
        log_callback: Callable(message, level) for progress/error reporting.
        stop_event: Checked between phases for cooperative cancellation.
        base_dir: The application's base directory (string); output files are
            written directly into it.

    Returns:
        A list of string paths for the output files created or updated
        (empty if the process was stopped during the archiving phase).

    Raises:
        FileNotFoundError, ValueError, RuntimeError, NameError, ImportError:
            propagated from setup or scraping so the GUI can show the message.
        RuntimeError: wraps any unexpected exception for consistent handling.
    """
    base_dir_path = Path(base_dir)
    output_dir = base_dir_path  # Output files go directly into the base directory

    try:
        # 1. Archive existing output files before starting
        archive_old_output_files(str(base_dir_path), log_callback)  # Util expects a string path

        if stop_event.is_set():
            log_callback("Process stopped during archiving phase.", "WARN")
            return []  # Stopped early: nothing was scraped

        # 2. Run the main scraping function
        log_callback("Archiving complete. Starting channel processing...", "INFO")
        output_files: List[Path] = scrape_channels(
            channellist_file=channellist_file,
            mode=mode,
            target_date=target_date,
            start_date=start_date,
            end_date=end_date,
            log_callback=log_callback,  # Pass the callback directly
            stop_event=stop_event,
            output_dir=output_dir  # scrape_channels expects a Path object
        )
        # Convert Path objects back to strings for the GUI handler
        return [str(f) for f in output_files]

    except (FileNotFoundError, ValueError, RuntimeError, NameError, ImportError):
        # Known failure modes from setup or scraping logic: log, then re-raise
        # with a bare `raise` so the original traceback reaches the GUI thread
        # intact (a `raise e` would append a redundant frame here).
        log_callback(f"Scraping process failed: {sys.exc_info()[1]}", "ERROR")
        raise
    except Exception as e:
        # Unexpected critical error: log with full traceback for diagnosis,
        # then wrap in a RuntimeError for consistent handling by the GUI.
        import traceback  # Local import: only needed on this cold path
        log_callback(f"An unexpected critical error occurred in run_scraping: {e}", "ERROR")
        log_callback(traceback.format_exc(), "ERROR")
        raise RuntimeError(f"Unexpected error during scraping execution: {e}") from e
--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # This file makes 'utils' a Python package.
2 | from .file_utils import archive_old_output_files, load_channels
--------------------------------------------------------------------------------
/src/utils/file_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | import shutil
4 | import random
5 | import time
6 | import re
7 | from pathlib import Path # Use pathlib for easier path operations
8 | from typing import List, Callable, Any # Added Any
9 |
10 | from src.config import ARCHIVE_DIR_NAME
11 |
def archive_old_output_files(base_dir_str: str, log_callback: Callable[[str, str], None]):
    """
    Moves existing output_*.txt files from the base directory to an archive subfolder.

    Each archived file keeps its stem and extension but gains a timestamp plus a
    random 4-digit suffix so repeated runs never overwrite earlier archives.
    Per-file errors are logged and skipped; the function itself never raises —
    all failures are reported through log_callback.

    Args:
        base_dir_str: Base directory (string) containing the output files.
        log_callback: Callable(message, level) for progress/error reporting.
    """
    base_dir = Path(base_dir_str)
    archive_path = base_dir / ARCHIVE_DIR_NAME

    try:
        # Create the archive directory on first use and report it once.
        # mkdir(exist_ok=True) raises on genuine failure, so no re-check is needed.
        if not archive_path.exists():
            archive_path.mkdir(parents=True, exist_ok=True)
            log_callback(f"Created archive directory: {archive_path}", "INFO")

        # Find output files in the base directory: output_ followed by anything, ending .txt
        output_files = list(base_dir.glob("output_*.txt"))

        if not output_files:
            log_callback("No previous output files found to archive.", "INFO")
            return

        log_callback(f"Found {len(output_files)} output file(s) to archive...", "INFO")
        archived_count = 0
        for file_path in output_files:
            try:
                # Build a unique archive filename: stem + timestamp + random suffix,
                # preserving the original extension (.txt).
                timestamp_str = time.strftime("%Y%m%d_%H%M%S")
                random_num = random.randint(1000, 9999)
                archive_name = f"{file_path.stem}_{timestamp_str}_{random_num}{file_path.suffix}"
                destination_path = archive_path / archive_name

                # Move the file into the archive
                shutil.move(str(file_path), str(destination_path))
                log_callback(f"  Archived {file_path.name} to {archive_name}", "DEBUG")  # Debug level
                archived_count += 1
            except OSError as e:
                log_callback(f"Error archiving file {file_path.name}: {e}", "ERROR")
            except Exception as e:  # Catch other potential errors during move/naming
                log_callback(f"Unexpected error archiving {file_path.name}: {e}", "ERROR")

        log_callback(f"Archiving complete. Moved {archived_count} file(s).", "INFO")

    except OSError as e:
        log_callback(f"Error creating or accessing archive directory {archive_path}: {e}", "ERROR")
    except Exception as e:
        log_callback(f"General error during archiving process: {e}", "ERROR")
63 |
64 |
def load_channels(channellist_file: str, log_callback: Callable[[str, str], None]) -> List[str]:
    """
    Loads and validates channel names/URLs from the given text file.

    Accepts plain usernames, '@username' handles, and URLs (the part after the
    last '/' is taken as the username). Blank lines and '#' comment lines are
    skipped; duplicates are dropped while preserving first-seen order.

    Args:
        channellist_file: Path to the channel list text file.
        log_callback: Callable(message, level) for progress/error reporting.

    Returns:
        A list of unique, valid channel usernames.

    Raises:
        FileNotFoundError: if the file doesn't exist.
        ValueError: if the file contains no valid channel names.
        RuntimeError: for other read errors.
    """
    channels: List[str] = []
    file_path = Path(channellist_file)

    if not file_path.is_file():
        log_callback(f"Channel list file not found: {channellist_file}", "ERROR")
        raise FileNotFoundError(f"Channel list file not found: {channellist_file}")

    try:
        with file_path.open("r", encoding="utf-8") as infile:
            for line_num, line in enumerate(infile, 1):
                original_line = line  # Keep original for logging errors
                line = line.strip()
                if not line or line.startswith('#'):  # Skip empty lines and comments
                    continue

                # Remove trailing slash(es) if present
                line = line.rstrip('/')

                # Extract the part after the last slash (potential channel name)
                if '/' in line:
                    # Takes the part after the last '/'
                    channel_name = line.rsplit('/', 1)[-1]
                else:
                    # Assume the whole line is the channel name if no slash
                    channel_name = line

                # Accept '@username' handles by stripping the leading '@'
                channel_name = channel_name.lstrip('@')

                # Basic validation: Telegram usernames are >= 5 chars, start with
                # a letter, and contain only letters, digits and underscores.
                # This also rejects full URLs mistakenly treated as names.
                if channel_name and re.match(r'^[a-zA-Z][a-zA-Z0-9_]{4,}$', channel_name):
                    if channel_name not in channels:  # Avoid duplicates
                        channels.append(channel_name)
                else:
                    log_callback(f"Skipping invalid or malformed channel entry on line {line_num}: '{original_line.strip()}' -> extracted '{channel_name}'", "WARN")

        log_callback(f"Loaded {len(channels)} unique, valid channel names from {file_path.name}.", "INFO")

        if not channels:
            log_callback("The channel list file is empty or contains no valid channel names.", "ERROR")
            raise ValueError(f"No valid channel names found in {file_path.name}.")

        return channels

    except ValueError:
        # Re-raise the documented "no valid channels" error untouched; the
        # generic handler below must not re-wrap it as a RuntimeError.
        raise
    except OSError as e:
        log_callback(f"Error reading channel list file {channellist_file}: {e}", "ERROR")
        raise RuntimeError(f"Error reading channel list file: {e}") from e
    except Exception as e:
        log_callback(f"Unexpected error loading channels from {channellist_file}: {e}", "ERROR")
        raise RuntimeError(f"Unexpected error loading channels: {e}") from e
--------------------------------------------------------------------------------