├── .github └── workflows │ └── release.yml ├── .gitignore ├── README.md ├── channelslists ├── OsintChannels.txt ├── knownPeople.txt ├── newsAgregators.txt ├── proRuChannels.txt └── proUkrChannels.txt ├── getTelegram.py ├── my_telegram_scrapper ├── __init__.py ├── client.py ├── models.py └── parser.py ├── requirements.bat ├── requirements.txt ├── run.bat └── src ├── config.py ├── gui ├── __init__.py ├── event_handlers.py ├── main_window.py └── ui_components.py ├── scraper ├── __init__.py └── core_logic.py └── utils ├── __init__.py └── file_utils.py /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Build and Release (PyInstaller) 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - 'v*' 9 | 10 | permissions: 11 | contents: write 12 | 13 | jobs: 14 | build-and-release: 15 | runs-on: windows-latest 16 | 17 | steps: 18 | - name: Check out the code 19 | uses: actions/checkout@v3 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v4 23 | with: 24 | python-version: '3.9' 25 | 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install -r requirements.txt 30 | pip install pyinstaller 31 | 32 | - name: Build with PyInstaller (--onedir) 33 | run: | 34 | pyinstaller --onedir --noconfirm --clean ` 35 | --add-data "channelslists;channelslists" ` 36 | --add-data "my_telegram_scrapper;my_telegram_scrapper" ` 37 | --add-data "src;src" ` 38 | getTelegram.py 39 | 40 | - name: Prepare Release Artifact Name 41 | id: artifact_name 42 | run: | 43 | $tagName = "${{ github.ref_name }}" 44 | $zipFileName = "getTelegram-${tagName}.zip" 45 | echo "zip_file_name=$zipFileName" | Out-File -FilePath $env:GITHUB_OUTPUT -Encoding utf8 -Append 46 | shell: pwsh 47 | 48 | - name: Create ZIP Archive 49 | run: Compress-Archive -Path dist/getTelegram/* -DestinationPath dist/${{ steps.artifact_name.outputs.zip_file_name }} 50 | shell: pwsh 51 | 52 | - name: Create 
Release 53 | if: startsWith(github.ref, 'refs/tags/') 54 | id: create_release 55 | uses: actions/create-release@v1 56 | with: 57 | tag_name: ${{ github.ref_name }} 58 | release_name: Release ${{ github.ref_name }} 59 | draft: false 60 | prerelease: false 61 | env: 62 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 63 | 64 | - name: Upload Release Asset (ZIP) 65 | if: startsWith(github.ref, 'refs/tags/') 66 | uses: actions/upload-release-asset@v1 67 | with: 68 | upload_url: ${{ steps.create_release.outputs.upload_url }} 69 | asset_path: dist/${{ steps.artifact_name.outputs.zip_file_name }} 70 | asset_name: ${{ steps.artifact_name.outputs.zip_file_name }} 71 | asset_content_type: application/zip 72 | env: 73 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 74 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # --- Python --- 2 | # Byte-compiled / optimized files / DLLs 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # Distribution / packaging (incl. PyInstaller *.spec files) 8 | build/ 9 | dist/ 10 | *.egg-info/ 11 | *.spec 12 | 13 | # Virtual environments (common names) 14 | .env 15 | .venv 16 | env/ 17 | venv/ 18 | ENV/ 19 | VENV/ 20 | */.venv/ 21 | */env/ 22 | */venv/ 23 | 24 | # IDE / Editor / OS specific files (IntelliJ IDEA / PyCharm, VS Code, Vim swap, macOS, Windows) 25 | .idea/ 26 | .vscode/ 27 | *.swp 28 | .DS_Store 29 | Thumbs.db 30 | 31 | # Log files (if generated) 32 | *.log 33 | 34 | # --- Specific to the TelegramOSINTPolo project --- 35 | 36 | # Generated output files with posts 37 | # Ignores files starting with "output_" and ending with ".txt" 38 | output_*.txt 39 | 40 | # Archive directory for old output files 41 | # Ignores the entire 'archive' directory in the project root 42 | /archive/* 43 | 44 | # User channel list files in the channelslists directory 45 | # Ignores all .txt files inside channelslists. 
46 | # Consider leaving an example file in the repository (e.g., example.txt) 47 | # and unignoring it using: !channelslists/example.txt 48 | channelslists/*.txt 49 | 50 | # --- Optionally --- 51 | # Files with secrets or sensitive configuration (if you add any) 52 | # e.g. secrets.ini, config.yaml -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TODO: 2 | * THE LIST OF CHANNELS IS RANDOM FOR NOW - TODO: REVIEW their correct political BIAS 3 | * The workflow for deploying .exe files requires a fix. I don't have time for that. Maybe later. Sorry 4 | 5 | # User Manual for the "Download Telegram Posts" Application 6 | 7 | This application is used to download posts from Telegram channels and save them to text files. 8 | 9 | ## 1. Preparing the Channel List: 10 | 11 | * **Recommendation:** Create *two* separate channel lists in `.txt` files. You can divide them thematically (e.g., "news", "sport", "technology") or by *political bias* (e.g., "pro-Russian", "pro-Ukrainian", "neutral"). Such division will facilitate later analysis. 12 | * Each channel should be on a separate line, either as a full URL (e.g., `https://t.me/channel_name`) or just the name (e.g., `channel_name`). ***Do not* add a '/' character at the end of the channel name**. 13 | * If you have channels in the full URL format, the program will automatically remove everything before the last '/' and retrieve only the channel name. 14 | * The program includes a sample list of over 160 channels, but these are just examples. You need to create your actual list yourself. 15 | 16 | ![{AD1F8859-3306-4393-9B14-A80DD1DE3A03}](https://github.com/user-attachments/assets/36ba23a4-fad8-4935-9bdf-cbc9ccbe3f8a) 17 | 18 | 19 | ## 2. Launching the Application: 20 | 21 | * After launching the application, you will see the main program window. 
22 | {A47CC81A-BCCB-4408-B63E-3492D3A2BB2C} 23 | 24 | 25 | 26 | 27 | ## 3. Configuration: 28 | 29 | * **Select Channel File:** 30 | * Click the "Browse..." button. 31 | * In the dialog window, select the `.txt` file containing the channel list. 32 | * The path to the selected file will appear in the text field. 33 | 34 | * **Select Date:** 35 | * Choose the desired date or date range using the provided options (Specific Date, Date Range, Today, Yesterday, All). 36 | 37 | ## 4. Downloading Posts: 38 | 39 | * Click the relevant "Download..." button based on your date selection. 40 | * The application will start downloading posts from the channels listed in the selected file. 41 | * **Logs (what's happening):** In the lower part of the window, in the "Logs" field, informational messages about the progress will be displayed: 42 | * The name of the currently downloaded channel. 43 | * The number of posts downloaded from that channel. 44 | * The content of the downloaded posts (including the link). 45 | * Any error messages. 46 | * The download process may take a while, depending on the number of channels and posts. Do not close the program window until the process is complete. You can use the "STOP SCRAPING" button if needed. 47 | 48 | ## 5. Saving Results: 49 | 50 | * After the download is complete (or stopped), the application automatically saves the posts to `.txt` file(s). 51 | * The output filename format is: `output_FILENAME-WITH-CHANNELS_YYYY-MM-DD.txt`, where: 52 | * `FILENAME-WITH-CHANNELS` is the name of the file you selected with the channel list (without the `.txt` extension). 53 | * `YYYY-MM-DD` is the date from which the posts were downloaded (year-month-day). For range or 'all' modes, multiple files might be created/updated. 54 | * The output file(s) will be created in the same directory where the application's executable (.exe) or Python script is located. Existing `output_*.txt` files will be moved to an `archive` subfolder before a new scrape starts. 
55 | * A success, interruption, or error message will appear in a pop-up window. 56 | 57 | ## 6. Data Analysis with NotebookLM: 58 | 59 | * **Key Step:** After downloading the data (the `output_*.txt` files), you can use Google's *NotebookLM* (https://notebooklm.google.com) to efficiently analyze the collected information. NotebookLM works like RAG (Retrieval-Augmented Generation), which means you can "talk" to your data. 60 | 61 | * There are two versions of the notebook. The free version is perfectly sufficient, but no one is stopping you from buying the Plus version. 62 | > In the NotebookLM version, you can have up to 100 notebooks, and each can contain up to 50 sources. Each source can contain up to half a million words. All users initially get 50 chat queries and can generate 3 audio summaries. 63 | > 64 | > If you upgrade to NotebookLM Plus, these limits increase at least 5-fold – to 500 notebooks and 300 sources per notebook. Daily query limits also increase – you will be able to ask up to 500 chat queries and generate 20 audio summaries each day. When sharing a notebook, the source limit does not change: both you and the people you share the notebook with can upload a maximum of 300 sources to it. 65 | 66 | ![{3BFABF80-1DF0-4C75-B817-88184E8B4240}](https://github.com/user-attachments/assets/5c66fa81-4d65-4c38-b97d-436fc4752983) 67 | 68 | * **How to use NotebookLM:** 69 | 1. Upload the downloaded `.txt` files to NotebookLM as sources. 70 | 2. NotebookLM will process these files and allow you to ask questions in natural language about their content. 71 | 3. You can ask for summaries, sentiment analysis, search for specific information, compare content from different channels, identify trends, and even generate new texts based on the downloaded data. 72 | 4. Use the notebook to ask questions about the uploaded files. 
73 | 74 | ![{3BDC503A-D6C4-47C2-87C1-7E3E075F5138}](https://github.com/user-attachments/assets/8f2c9535-5d6a-4776-a7bc-a738fcde6578) 75 | 76 | 77 | * **Advantages of Analysis in NotebookLM (RAG):** 78 | * **Context:** NotebookLM analyzes your questions in the *context* of the uploaded data. Answers are based *directly* on information from the files, minimizing the risk of hallucinations (the language model inventing information). 79 | * **Precision:** You can refer to specific text fragments, making it easier to verify information and track sources. NotebookLM can indicate where a particular answer comes from. 80 | * **Efficiency:** You don't have to manually search through hundreds of posts. NotebookLM does it for you, saving your time and effort. 81 | * **Deeper Analysis:** Thanks to the ability to ask questions and generate summaries, you can gain much deeper insights into the data than with traditional analysis. You can discover hidden patterns, connections, and trends that might otherwise be missed. 82 | * **Interactivity:** NotebookLM allows dynamic interaction with data. You can modify your queries on the fly and get immediate answers. 83 | * **Security:** NotebookLM, using the uploaded files as its source of information, does not draw information from uncertain sources. 84 | 85 | ## Additional Notes: 86 | 87 | * Ensure you have a stable internet connection while downloading posts. 88 | * For a very large number of channels or posts, downloading may take longer. The 'Download All' option can be particularly time-consuming. 89 | * If an error occurs, check the message content in the "Logs" field and ensure the provided channel name is correct and the channel is publicly accessible via web view. 90 | * The program used the `accless-tg-scraper` library, which worked without using the official Telegram API by scraping the public web preview of channels. 
https://github.com/Kisspeace/accless-tg-scraper. However, after some consideration — and realizing that updates were nowhere in sight — I had to write my own scraper from scratch. I still leave a link to that repo to credit where the idea came from. 91 | 92 | ## REMEMBER THAT THE RESPONSIBILITY FOR VERIFYING SOURCES LIES SOLELY WITH YOU. THE NUMBERS NEXT TO THE TEXT (1) IN NOTEBOOKLM ARE LINKS TO QUOTATIONS USED BY THE LLM. THE OUTPUT FILES FROM THIS APPLICATION CONTAIN THE POST CONTENT AND A DIRECT LINK TO THE ORIGINAL POST (2) ON TELEGRAM FOR VERIFICATION. 93 | ![image](https://github.com/user-attachments/assets/3779eb4f-2f3a-4b82-a3e4-1170598bed5f) 94 | -------------------------------------------------------------------------------- /channelslists/OsintChannels.txt: -------------------------------------------------------------------------------- 1 | https://t.me/osintbees 2 | https://t.me/DeepStateUA 3 | https://t.me/rybar 4 | https://t.me/osint_69 5 | https://t.me/arrowsmap 6 | https://t.me/CITeam 7 | https://t.me/CIT_shellings 8 | https://t.me/radar_russia_monitor 9 | https://t.me/control_sigma -------------------------------------------------------------------------------- /channelslists/knownPeople.txt: -------------------------------------------------------------------------------- 1 | https://t.me/AleksandrSemchenko 2 | https://t.me/NeoficialniyBeZsonoV 3 | https://t.me/vrogov 4 | https://t.me/yzheleznyak 5 | https://t.me/olegtsarov 6 | https://t.me/zhivoff22 7 | https://t.me/SergeyKolyasnikov 8 | https://t.me/ASupersharij 9 | https://t.me/epoddubny 10 | https://t.me/khramov_alexander 11 | https://t.me/A_S_Sukonkin 12 | https://t.me/i_strelkov_2023 13 | https://t.me/IgorLinkChannel 14 | https://t.me/VGlagola 15 | https://t.me/RKadyrov_95 16 | https://t.me/bochkala_war 17 | https://t.me/filatovcorr 18 | https://t.me/margaritasimonyan 19 | https://t.me/garmaev_alexander 20 | https://t.me/Tsaplienko 21 | https://t.me/anatoly_nesmiyan 22 | https://t.me/nevzorovtv 23 | 
https://t.me/Sladkov_plus 24 | https://t.me/sashakots 25 | https://t.me/iistrelkov 26 | https://t.me/montian_official2 27 | https://t.me/wargonzo 28 | https://t.me/montyan2 29 | https://t.me/shevchenkomax_1 30 | https://t.me/strelkovii 31 | https://t.me/pgubarev 32 | https://t.me/BalitskyEV 33 | https://t.me/David_Arakhamia 34 | https://t.me/RSaponkov 35 | https://t.me/stanislav_osman 36 | https://t.me/rubaevCIS 37 | https://t.me/ryabseva_zhanna 38 | https://t.me/agurulev 39 | https://t.me/yurasumy 40 | https://t.me/akashevarova 41 | https://t.me/butrimov 42 | https://t.me/tmelnychuk 43 | https://t.me/ButusovPlus 44 | https://t.me/mardanaka 45 | https://t.me/alexandrshtefanov 46 | https://t.me/aleksandr_skif 47 | https://t.me/romanov_92 48 | https://t.me/JokerDPR 49 | https://t.me/vladlentatarsky 50 | https://t.me/a_shtirlitz 51 | https://t.me/JusuMakonis 52 | https://t.me/shouvalov 53 | https://t.me/daniel_orlov 54 | https://t.me/osirskiy -------------------------------------------------------------------------------- /channelslists/newsAgregators.txt: -------------------------------------------------------------------------------- 1 | https://t.me/ToBeOrChat 2 | https://t.me/ejdailyru 3 | https://t.me/radiotrek 4 | https://t.me/supernova_plus 5 | https://t.me/objectivetv 6 | https://t.me/InsiderUA_UK 7 | https://t.me/bbc_khm 8 | https://t.me/uniannet 9 | https://t.me/nexta_live 10 | https://t.me/Starkon_city 11 | https://t.me/khmlv 12 | https://t.me/bbbreaking 13 | https://t.me/sotavisionmedia 14 | https://t.me/roy_tv_mk 15 | https://t.me/astrapress 16 | https://t.me/UaOnlii 17 | https://t.me/anna_news 18 | https://t.me/zvizdecmanhustu 19 | https://t.me/BILD_Russian 20 | https://t.me/agentstvonews 21 | https://t.me/bihusinfo 22 | https://t.me/vchkogpu 23 | https://t.me/russianocontext 24 | https://t.me/idelrealii 25 | https://t.me/mobilizationnews 26 | https://t.me/russian_trash_news 27 | https://t.me/horizontal_russia 28 | https://t.me/news_sirena 29 | 
https://t.me/dirtytatarstan 30 | https://t.me/nsnfm 31 | https://t.me/sledcom_press 32 | https://t.me/ne_zhdi_novosti 33 | https://t.me/brieflyru 34 | https://t.me/tvrain 35 | https://t.me/dagpravdaru 36 | https://t.me/YourNewsTalk 37 | https://t.me/ToBeOr_Official 38 | https://t.me/svobodnieslova 39 | https://t.me/currenttime 40 | https://t.me/milinfolive 41 | https://t.me/meduzalive 42 | https://t.me/opersvodki 43 | https://t.me/bazabazon 44 | https://t.me/playcivilization 45 | https://t.me/rusvesnasu 46 | https://t.me/readovkaru 47 | https://t.me/tradkz 48 | https://t.me/SputnikAtoNews 49 | https://t.me/shot_shot 50 | https://t.me/rbc_news 51 | https://t.me/moscow_laundry 52 | https://t.me/periskop_pacific 53 | https://t.me/regnum_na -------------------------------------------------------------------------------- /channelslists/proRuChannels.txt: -------------------------------------------------------------------------------- 1 | https://t.me/osetin20 2 | https://t.me/WarZoneInc 3 | https://t.me/Love_Russia_Beauty 4 | https://t.me/skurlatovlive 5 | https://t.me/donrf22 6 | https://t.me/brussinf 7 | https://t.me/economica_russia 8 | https://t.me/russ_orientalist 9 | https://t.me/kremlin_secrets 10 | https://t.me/mrfrostoviklive 11 | https://t.me/mortisaeterna 12 | https://t.me/sidpolit 13 | https://t.me/Kolomna_Gorod 14 | https://t.me/Ugolok_Sitha 15 | https://t.me/kolomna750 16 | https://t.me/dolg_z 17 | https://t.me/combat_hemp 18 | https://t.me/btvt2019 19 | https://t.me/ramzayiegokomanda 20 | https://t.me/philologist_zov 21 | https://t.me/rustroyka1945 22 | https://t.me/m0nstas 23 | https://t.me/osvedomitell_alex 24 | https://t.me/divannaya_brigada 25 | https://t.me/soldat_prav 26 | https://t.me/ZONA_P 27 | https://t.me/russkiy_opolchenec 28 | https://t.me/BKPROGRESSor 29 | https://t.me/SIL0VIKI 30 | https://t.me/motopatriot78 31 | https://t.me/donetchan 32 | https://t.me/Nackepelo 33 | https://t.me/TheDeadDistrict 34 | https://t.me/notes_veterans 35 | 
https://t.me/babaycalls 36 | https://t.me/tankistrossii100 37 | https://t.me/news_mvddnr 38 | https://t.me/z4lpr 39 | https://t.me/khornegroup 40 | https://t.me/Soldieroffortune777 41 | https://t.me/dosye_shpiona 42 | https://t.me/russkiegramoty 43 | https://t.me/norinea 44 | https://t.me/communitynumber5 45 | https://t.me/OmTVchannel 46 | https://t.me/vozhak_Z 47 | https://t.me/obshina_ru 48 | https://t.me/lost_generation_88 49 | https://t.me/itsdonetsk -------------------------------------------------------------------------------- /channelslists/proUkrChannels.txt: -------------------------------------------------------------------------------- 1 | https://t.me/slvn_pomet 2 | 3 | https://t.me/dontstopwar 4 | 5 | https://t.me/Operatyvnyi_Donbas 6 | 7 | https://t.me/ukrainian_militant 8 | 9 | https://t.me/batalionmonako 10 | 11 | https://t.me/karymat 12 | 13 | https://t.me/Ateobreaking 14 | 15 | https://t.me/BaluHUB777 16 | 17 | https://t.me/adept_ua 18 | 19 | https://t.me/Za_Derjavy 20 | 21 | https://t.me/odeskaODA 22 | 23 | https://t.me/OSHP_225 24 | 25 | https://t.me/sprava_groma 26 | 27 | https://t.me/ua_stalker 28 | 29 | https://t.me/hochu_zhyt 30 | 31 | https://t.me/warinmyeyes_chat 32 | 33 | https://t.me/braty_yakovlevu 34 | 35 | https://t.me/TyskNIP 36 | 37 | https://t.me/itarmyofukraine2022 38 | 39 | https://t.me/atodoneck 40 | 41 | https://t.me/dtek_ua 42 | 43 | https://t.me/atomiccherry 44 | 45 | https://t.me/Ukr_G_M 46 | 47 | https://t.me/lost_warinua 48 | 49 | https://t.me/khersonskaODA 50 | 51 | https://t.me/ua_hero -------------------------------------------------------------------------------- /getTelegram.py: -------------------------------------------------------------------------------- 1 | # Main entry point for the Telegram Scraper application. 
2 | # import tkinter as tk # OLD 3 | import customtkinter as ctk # NEW 4 | from tkinter import messagebox # Keep messagebox from standard tkinter 5 | import os 6 | import sys 7 | from pathlib import Path # Use pathlib for easier path handling 8 | import tkinter as tk # Potrzebne dla root_err w bloku except 9 | 10 | 11 | # --- Determine Base Directory --- 12 | # (Keep this section as is) 13 | if getattr(sys, 'frozen', False): 14 | base_dir = Path(sys.executable).parent 15 | elif __file__: 16 | base_dir = Path(__file__).parent 17 | else: 18 | base_dir = Path.cwd() 19 | 20 | # --- Dynamically add project root and src to sys.path --- 21 | project_root = base_dir 22 | src_dir = project_root / 'src' 23 | if str(project_root) not in sys.path: 24 | sys.path.insert(0, str(project_root)) 25 | if str(src_dir) not in sys.path: 26 | sys.path.insert(0, str(src_dir)) 27 | 28 | # --- Set CustomTkinter Appearance --- 29 | ctk.set_appearance_mode("System") # Options: "System", "Light", "Dark" 30 | ctk.set_default_color_theme("blue") # Options: "blue", "green", "dark-blue" 31 | 32 | # --- Import GUI Component and Dependencies --- 33 | try: 34 | from gui.main_window import TelegramScraperGUI 35 | # (Keep other imports and the basic structure of the try/except block) 36 | from scraper import run_scraping 37 | from my_telegram_scrapper import SimpleScraperClient 38 | except ImportError as e: 39 | project_root = base_dir # <-- DODANO TĘ LINIĘ, aby naprawić NameError 40 | error_details = f"{e}\n\n" 41 | error_details += f"Could not import required components.\n" 42 | error_details += f"Please ensure 'src' and 'my_telegram_scrapper' directories exist relative to the executable or script:\n{project_root}\n" 43 | error_details += "Also, verify that all dependencies (including customtkinter) from requirements.txt are installed." 
44 | print(f"Fatal Error: {error_details}") 45 | # Attempt to show a GUI error message (using standard tkinter temporarily if ctk fails) 46 | try: 47 | # Use a temporary standard Tk root for the error if ctk fails early 48 | root_err = tk.Tk() 49 | root_err.withdraw() 50 | messagebox.showerror("Startup Error", f"Failed to load application components.\n\n{error_details}") 51 | root_err.destroy() 52 | except Exception: # Catch broader exceptions here, including tk.TclError 53 | print("GUI error: Could not display the error message box.") 54 | sys.exit(1) 55 | except Exception as e: 56 | # Catch any other unexpected error during initial imports 57 | print(f"Fatal Error during startup: {e}") 58 | try: 59 | # Use a temporary standard Tk root for the error if ctk fails early 60 | root_err = tk.Tk() 61 | root_err.withdraw() 62 | messagebox.showerror("Startup Error", f"An unexpected error occurred during initialization:\n\n{e}") 63 | root_err.destroy() 64 | except Exception: 65 | pass # Console print is the fallback 66 | sys.exit(1) 67 | 68 | # --- Main Execution Function --- 69 | def main(): 70 | """Sets up and runs the CustomTkinter application.""" 71 | # root = tk.Tk() # OLD 72 | root = ctk.CTk() # NEW 73 | try: 74 | # Pass the base_dir (as a string or Path object) to the GUI 75 | app = TelegramScraperGUI(root, str(base_dir)) # Pass as string if GUI expects it 76 | root.minsize(600, 700) # Adjusted minsize slightly 77 | root.mainloop() 78 | except Exception as e: 79 | print(f"Fatal Error running the application: {e}") 80 | # Attempt to show error message if GUI fails during runtime 81 | try: 82 | # customtkinter windows might not have winfo_exists in the same way 83 | # Just try showing the error 84 | messagebox.showerror("Application Error", f"An unexpected error occurred while running:\n\n{e}") 85 | if root: # Check if root object exists 86 | root.destroy() 87 | except Exception: # Catch broader exceptions 88 | pass # Avoid errors if the window is already gone 89 | 
sys.exit(1) 90 | 91 | # --- Script Entry Point --- 92 | if __name__ == "__main__": 93 | main() 94 | -------------------------------------------------------------------------------- /my_telegram_scrapper/__init__.py: -------------------------------------------------------------------------------- 1 | # my_scraper/__init__.py 2 | from .client import SimpleScraperClient 3 | from .models import SimpleTgPost, ScrapedPage 4 | # You can add parser functions or model classes here if you want direct access 5 | # e.g., from my_scraper import SimpleScraperClient -------------------------------------------------------------------------------- /my_telegram_scrapper/client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.exceptions import RequestException, ConnectionError, Timeout 3 | from typing import Optional, Dict 4 | 5 | from .parser import parse_page 6 | from .models import ScrapedPage 7 | 8 | class SimpleScraperClient: 9 | """ 10 | A simple client to fetch and parse Telegram channel web preview pages. 11 | """ 12 | BASE_URL: str = "https://t.me" 13 | DEFAULT_USER_AGENT: str = ( 14 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' 15 | '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' 16 | ) 17 | REQUEST_TIMEOUT: int = 15 # Slightly longer timeout 18 | 19 | def __init__(self, headers: Optional[Dict[str, str]] = None): 20 | """ 21 | Initializes the requests session with default or provided headers. 
22 | """ 23 | self.session = requests.Session() 24 | # Set default headers to mimic a browser 25 | default_headers = { 26 | 'User-Agent': self.DEFAULT_USER_AGENT, 27 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 28 | 'Accept-Language': 'en-US,en;q=0.9', 29 | } 30 | # Update with provided headers, overriding defaults if necessary 31 | self.session.headers.update(headers or default_headers) 32 | 33 | def get_channel_page(self, channel_username: str, before_token: Optional[str] = None) -> Optional[ScrapedPage]: 34 | """ 35 | Fetches and parses a single page of posts from a channel's web view. 36 | 37 | Args: 38 | channel_username: The username of the target channel (without '@'). 39 | before_token: The token/ID to fetch posts before this point (for pagination). 40 | 41 | Returns: 42 | A ScrapedPage object containing posts and next page token, or None on error. 43 | """ 44 | url = f"{self.BASE_URL}/s/{channel_username}" 45 | params = {} 46 | if before_token: 47 | params['before'] = before_token 48 | 49 | try: 50 | response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT) 51 | response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) 52 | 53 | # Check explicit status code, though raise_for_status covers most cases 54 | if response.status_code == 200: 55 | # Parse the HTML content 56 | return parse_page(response.text) 57 | else: 58 | # This case is less likely if raise_for_status() is used, but kept for safety 59 | print(f"Error: Received unexpected status code {response.status_code} for {url}") 60 | return None 61 | 62 | except Timeout: 63 | print(f"Error: Request timed out for {url}") 64 | return None 65 | except ConnectionError: 66 | print(f"Error: Could not connect to {url}. 
Check network connection.") 67 | return None 68 | except RequestException as e: 69 | # Catches other requests-related errors (like HTTPError from raise_for_status) 70 | print(f"Error fetching {url}: {e}") 71 | return None 72 | except Exception as e: 73 | # Catch potential errors during parsing (though should be handled in parser ideally) 74 | print(f"An unexpected error occurred processing channel '{channel_username}': {e}") 75 | return None 76 | 77 | def close(self): 78 | """Closes the underlying requests session.""" 79 | if self.session: 80 | self.session.close() 81 | print("Requests session closed.") # Optional: confirmation 82 | 83 | # Context manager support 84 | def __enter__(self): 85 | return self 86 | 87 | def __exit__(self, exc_type, exc_val, exc_tb): 88 | """Ensures the session is closed when exiting a 'with' block.""" 89 | self.close() -------------------------------------------------------------------------------- /my_telegram_scrapper/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from datetime import datetime 3 | from typing import Optional, List 4 | 5 | @dataclass 6 | class SimpleTgAuthor: 7 | """Represents basic information about a post author.""" 8 | username: Optional[str] = None 9 | display_name: Optional[str] = None 10 | profile_url: Optional[str] = None 11 | 12 | @dataclass 13 | class SimpleTgPost: 14 | """Represents basic information about a scraped Telegram post.""" 15 | post_id: Optional[int] = None 16 | post_url: Optional[str] = None 17 | content: Optional[str] = None 18 | timestamp: Optional[datetime] = None 19 | views: Optional[str] = None # e.g., '1.8K', kept as string for simplicity 20 | # Use field to provide a default_factory for mutable types like classes 21 | author: SimpleTgAuthor = field(default_factory=SimpleTgAuthor) 22 | # Add other fields as needed (e.g., media URLs) 23 | # image_urls: List[str] = field(default_factory=list) 24 | # 
video_urls: List[str] = field(default_factory=list) 25 | 26 | @dataclass 27 | class ScrapedPage: 28 | """Represents the results from scraping one page of a channel.""" 29 | posts: List[SimpleTgPost] = field(default_factory=list) 30 | next_page_token: Optional[str] = None # e.g., the 'before' ID for the next request 31 | # channel_name: Optional[str] = None # Could add channel info here if needed -------------------------------------------------------------------------------- /my_telegram_scrapper/parser.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup, Tag 2 | from datetime import datetime 3 | import re 4 | from typing import List, Optional 5 | 6 | # Import the dataclass models 7 | from .models import SimpleTgPost, SimpleTgAuthor, ScrapedPage 8 | 9 | TELEGRAM_BASE_URL: str = "https://t.me" 10 | 11 | def _parse_post_id_from_url(url: Optional[str]) -> Optional[int]: 12 | """Extracts the post ID (integer) from a Telegram post URL.""" 13 | if not url: 14 | return None 15 | # Regex: find digits preceded by '/' and followed by '?' or end of string 16 | match = re.search(r'/(\d+)(?:\?|$)', url) 17 | return int(match.group(1)) if match else None 18 | 19 | def _parse_username_from_url(url: Optional[str]) -> Optional[str]: 20 | """Extracts the username from a Telegram profile or channel URL.""" 21 | if not url: 22 | return None 23 | # Regex: find text after the last '/' but before '?' or end of string 24 | # Handles /s/channel, /channel, etc. 
25 | match = re.search(r'/([^/?]+)$', url) 26 | # Alternative if above is too greedy: r'/s?/([^/?]+)' 27 | return match.group(1) if match else None 28 | 29 | def _safe_find_text(element: Optional[Tag], selector: str, strip: bool = True) -> Optional[str]: 30 | """Safely finds an element using a CSS selector and returns its stripped text.""" 31 | if not element: 32 | return None 33 | found = element.select_one(selector) # Use CSS selector 34 | return found.get_text(strip=strip) if found else None 35 | 36 | def _safe_get_attr(element: Optional[Tag], selector: str, attribute: str) -> Optional[str]: 37 | """Safely finds an element using a CSS selector and returns a specific attribute.""" 38 | if not element: 39 | return None 40 | found = element.select_one(selector) 41 | return found.get(attribute) if found else None 42 | 43 | def parse_single_post(post_element: Tag) -> Optional[SimpleTgPost]: 44 | """Parses a single post HTML element (div.tgme_widget_message_wrap) into a SimpleTgPost object.""" 45 | if not isinstance(post_element, Tag): 46 | return None 47 | 48 | post = SimpleTgPost() # Initialize with defaults from dataclass 49 | 50 | # Main message container is crucial 51 | widget_message = post_element.select_one(".tgme_widget_message") 52 | if not widget_message: 53 | print("Warning: Could not find main message container ('.tgme_widget_message') in post element.") 54 | return None # Cannot proceed without this 55 | 56 | # --- Basic Post Info --- 57 | data_post_url = widget_message.get('data-post-url') 58 | data_post = widget_message.get('data-post') # e.g., channel/12345 59 | if data_post_url: 60 | post.post_url = data_post_url 61 | elif data_post: 62 | post.post_url = f"{TELEGRAM_BASE_URL}/{data_post}" 63 | post.post_id = _parse_post_id_from_url(post.post_url) 64 | 65 | # --- Author Info --- 66 | # Look for the primary author name structure first 67 | author_link_tag = widget_message.select_one(".tgme_widget_message_owner_name a") 68 | if author_link_tag: 69 | 
post.author.profile_url = author_link_tag.get('href') 70 | post.author.username = _parse_username_from_url(post.author.profile_url) 71 | # Get text directly from the link's span if available 72 | author_name_span = author_link_tag.select_one("span") # or "span.name" if specific 73 | post.author.display_name = author_name_span.get_text(strip=True) if author_name_span else author_link_tag.get_text(strip=True) 74 | else: 75 | # Fallback for potentially different structures (e.g., forwarded messages might differ) 76 | author_user_tag = widget_message.select_one(".tgme_widget_message_from_author") # Check for forwarded author 77 | if author_user_tag: 78 | post.author.display_name = author_user_tag.get_text(strip=True) 79 | # Profile URL/username might not be available for forwarded authors in preview 80 | 81 | # --- Content --- 82 | # Select the text element, handling potential variations 83 | text_element = widget_message.select_one(".tgme_widget_message_text") 84 | if text_element: 85 | # Use separator='\n' to preserve line breaks within the post text 86 | post.content = text_element.get_text(separator='\n', strip=True) 87 | else: 88 | # Sometimes content is directly in the message bubble without a specific text class 89 | # This is less reliable and might grab unwanted text like "Forwarded message" 90 | # fallback_text = widget_message.select_one(".tgme_widget_message_bubble > .tgme_widget_message_text") # Example 91 | post.content = None # Or try a broader fallback if needed 92 | 93 | # --- Timestamp --- 94 | time_tag = widget_message.select_one(".tgme_widget_message_date time") 95 | if time_tag and time_tag.get('datetime'): 96 | try: 97 | # Attempt to parse ISO format timestamp (e.g., 2023-10-27T10:30:00+00:00) 98 | post.timestamp = datetime.fromisoformat(time_tag['datetime']) 99 | except ValueError: 100 | print(f"Warning: Could not parse timestamp datetime: {time_tag.get('datetime')}") 101 | post.timestamp = None # Handle parsing errors gracefully 102 | 103 | # 
--- Views --- 104 | # Views might be inside the date container or separate 105 | post.views = _safe_find_text(widget_message, ".tgme_widget_message_views") 106 | 107 | # --- Placeholder: Add parsing for media (images, videos) if needed --- 108 | # Example (very basic background image style): 109 | # photo_wrap = widget_message.select_one(".tgme_widget_message_photo_wrap[style*='background-image']") 110 | # if photo_wrap: 111 | # style = photo_wrap.get('style', '') 112 | # match = re.search(r"background-image:url\('(.*?)'\)", style) 113 | # if match: 114 | # # post.image_urls.append(match.group(1)) # Assuming image_urls list exists 115 | # pass 116 | 117 | return post 118 | 119 | def parse_page(html_content: str) -> ScrapedPage: 120 | """Parses the HTML content of a Telegram channel's web preview page.""" 121 | soup = BeautifulSoup(html_content, 'lxml') # Use lxml parser 122 | page_result = ScrapedPage() # Initialize dataclass 123 | 124 | # Find all post container elements (usually divs with this class) 125 | post_elements = soup.select(".tgme_widget_message_wrap") # Use CSS selector 126 | 127 | if not post_elements: 128 | print("Warning: No post elements found with selector '.tgme_widget_message_wrap'. Page structure might have changed.") 129 | 130 | for element in post_elements: 131 | parsed_post = parse_single_post(element) 132 | if parsed_post: 133 | page_result.posts.append(parsed_post) 134 | 135 | # Find the token/ID for the *next* page (link to load *older* posts) 136 | # The 'Load more' link usually contains '?before=...' 
137 | load_more_link = soup.select_one('a.tme_messages_more[href*="?before="]') 138 | if load_more_link: 139 | href = load_more_link.get('href', '') 140 | # Extract the 'before' parameter value 141 | match = re.search(r'[?&]before=(\d+)', href) 142 | if match: 143 | page_result.next_page_token = match.group(1) 144 | else: 145 | print("Warning: Found 'Load More' link but could not extract 'before' token.") 146 | 147 | # --- Placeholder: Add parsing for channel info (title, description, etc.) if needed --- 148 | # channel_info_header = soup.select_one(".tgme_channel_info_header_title") 149 | # if channel_info_header: 150 | # page_result.channel_name = channel_info_header.get_text(strip=True) 151 | # pass 152 | 153 | return page_result -------------------------------------------------------------------------------- /requirements.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo Installing required Python packages from requirements.txt... 3 | pip install -r requirements.txt 4 | echo. 5 | echo Installation attempt finished. Check above for any errors. 6 | pause 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | beautifulsoup4 3 | lxml 4 | customtkinter -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo Running getTelegram.py... 3 | python getTelegram.py 4 | echo. 5 | echo Program finished or closed. 
# TelegramOSINTPolo-main/src/config.py
from datetime import date

# --- CONSTANTS ---

# Name of the directory (relative to the application base dir) into which
# old output files are moved.
ARCHIVE_DIR_NAME: str = "archive"

# Stop scraping posts older than this date (inclusive).
# Ensures we don't go back indefinitely.
# Also used by the GUI date pickers as the earliest selectable date.
CUTOFF_DATE: date = date(2022, 1, 1)

# Potentially add other configurations here if needed
# DEFAULT_OUTPUT_FILENAME_FORMAT: str = "output_{list_name}_{date}.txt"
# LOG_LEVEL: str = "INFO"
Please check installation.") 18 | import sys 19 | sys.exit(1) 20 | 21 | class GuiEventHandlers: 22 | """Contains event handling methods for the TelegramScraperGUI.""" 23 | 24 | def __init__(self, app_instance): 25 | self.app = app_instance 26 | 27 | # --- Log Handling --- 28 | def log_message(self, message: str, level: str = "INFO"): 29 | # (Keep this method as is) 30 | level = level.upper() 31 | if level not in ["DEBUG", "INFO", "WARN", "ERROR"]: 32 | level = "INFO" 33 | timestamp = datetime.now().strftime("%H:%M:%S") 34 | formatted_message = f"[{timestamp}][{level}] {message}" 35 | try: 36 | self.app.log_queue.put(formatted_message) 37 | except AttributeError: 38 | print(f"Fallback Log: {formatted_message}") 39 | 40 | def process_log_queue(self): 41 | """Processes messages from the log queue and updates the GUI log text (CTkTextbox).""" 42 | try: 43 | while not self.app.log_queue.empty(): 44 | full_message = self.app.log_queue.get_nowait() 45 | tag = "" 46 | # Determine tag based on level prefix (same logic) 47 | if "[ERROR]" in full_message: tag = "ERROR" 48 | elif "[WARN]" in full_message: tag = "WARN" 49 | elif "[INFO]" in full_message: tag = "INFO" 50 | elif "[DEBUG]" in full_message: tag = "DEBUG" 51 | 52 | # Check if master window and log_text widget exist 53 | # Use ctk checks if available, otherwise standard tkinter checks 54 | if self.app.master and hasattr(self.app, 'log_text') and self.app.log_text: 55 | # CTkTextbox needs state change to insert 56 | self.app.log_text.configure(state="normal") 57 | if tag: 58 | self.app.log_text.insert(ctk.END, full_message + '\n', (tag,)) 59 | else: 60 | self.app.log_text.insert(ctk.END, full_message + '\n') 61 | self.app.log_text.see(ctk.END) # Scroll to the end 62 | self.app.log_text.configure(state="disabled") # Disable editing again 63 | except queue.Empty: 64 | pass 65 | except Exception as e: 66 | print(f"Error processing log queue: {e}") # Fallback print 67 | finally: 68 | # Reschedule check only if the master 
window still exists (basic check) 69 | if self.app.master: 70 | self.app.master.after(100, self.process_log_queue) 71 | 72 | 73 | # --- File Dialog (No longer used for channel list) --- 74 | def open_file_dialog(self): 75 | """Opens a dialog to select the channel list file. (DEPRECATED)""" 76 | self.log_message("Browse button clicked (feature deprecated, use dropdown).", "WARN") 77 | messagebox.showinfo("Info", "Channel list selection is now done via the dropdown menu.") 78 | # Keep original logic commented out or remove if preferred 79 | # initial_dir_path = Path(self.app.base_dir) / "channelslists" 80 | # if not initial_dir_path.is_dir(): 81 | # initial_dir_path = Path(self.app.base_dir) 82 | # filename = filedialog.askopenfilename(...) 83 | # if filename: self.app.channellist_path.set(filename) ... 84 | 85 | 86 | # --- Date Validation --- 87 | def validate_date_spinbox(self, prefix: str): 88 | """Adjusts the maximum day for the selected month and year spinboxes.""" 89 | # (This logic interacts with tk.IntVar and ttk.Spinbox, should remain compatible) 90 | try: 91 | if prefix == 'sel': 92 | year_var, month_var, day_var = self.app.sel_year, self.app.sel_month, self.app.sel_day 93 | day_spinbox = self.app.day_spinbox 94 | elif prefix == 'start': 95 | year_var, month_var, day_var = self.app.start_year, self.app.start_month, self.app.start_day 96 | day_spinbox = self.app.start_day_spinbox 97 | elif prefix == 'end': 98 | year_var, month_var, day_var = self.app.end_year, self.app.end_month, self.app.end_day 99 | day_spinbox = self.app.end_day_spinbox 100 | else: 101 | self.log_message(f"Invalid prefix '{prefix}' for date validation.", "WARN") 102 | return 103 | 104 | year = year_var.get() 105 | month = month_var.get() 106 | 107 | if 1 <= month <= 12: 108 | _, days_in_month = calendar.monthrange(year, month) 109 | # Configure spinbox only if it exists (using standard tkinter methods) 110 | if day_spinbox and getattr(day_spinbox, 'winfo_exists', lambda: False)(): # Safer 
check 111 | day_spinbox.config(to=days_in_month) 112 | if day_var.get() > days_in_month: 113 | day_var.set(days_in_month) 114 | except ValueError: pass 115 | except (tk.TclError, AttributeError): pass 116 | except Exception as e: 117 | self.log_message(f"Error validating date spinbox ({prefix}): {e}", "ERROR") 118 | 119 | # --- Date Parsing Helpers --- 120 | def _parse_date_or_show_error(self, year_var, month_var, day_var, date_description: str) -> Optional[date]: 121 | # (This logic uses tk.IntVar.get(), should remain compatible) 122 | # (Keep this method as is) 123 | try: 124 | year_val, month_val, day_val = year_var.get(), month_var.get(), day_var.get() 125 | # Explicitly convert to integers 126 | parsed_date = date(int(year_val), int(month_val), int(day_val)) 127 | if parsed_date > date.today(): 128 | messagebox.showwarning("Invalid Date", f"The selected {date_description} date ({parsed_date.strftime('%Y-%m-%d')}) cannot be in the future.") 129 | return None 130 | if parsed_date < CUTOFF_DATE: 131 | messagebox.showwarning("Invalid Date", f"The selected {date_description} date ({parsed_date.strftime('%Y-%m-%d')}) must be on or after {CUTOFF_DATE.strftime('%Y-%m-%d')}.") 132 | return None 133 | return parsed_date 134 | except ValueError: 135 | messagebox.showerror("Invalid Date", f"The selected {date_description} date is invalid. 
Please check the year, month, and day.") 136 | return None 137 | 138 | def _get_dates_for_mode(self, mode: str) -> Optional[Tuple[Optional[date], Optional[date], Optional[date]]]: 139 | # (This logic uses _parse_date_or_show_error, keep as is) 140 | target_date_obj: Optional[date] = None 141 | start_date_obj: Optional[date] = None 142 | end_date_obj: Optional[date] = None 143 | 144 | if mode == 'today': target_date_obj = date.today() 145 | elif mode == 'yesterday': target_date_obj = date.today() - timedelta(days=1) 146 | elif mode == 'specific_date': 147 | target_date_obj = self._parse_date_or_show_error(self.app.sel_year, self.app.sel_month, self.app.sel_day, "specific") 148 | if target_date_obj is None: return None 149 | elif mode == 'date_range': 150 | start_date_obj = self._parse_date_or_show_error(self.app.start_year, self.app.start_month, self.app.start_day, "start") 151 | if start_date_obj is None: return None 152 | end_date_obj = self._parse_date_or_show_error(self.app.end_year, self.app.end_month, self.app.end_day, "end") 153 | if end_date_obj is None: return None 154 | if start_date_obj > end_date_obj: 155 | messagebox.showwarning("Invalid Date Range", "The 'Start Date' cannot be later than the 'End Date'.") 156 | return None 157 | elif mode == 'all': pass 158 | return target_date_obj, start_date_obj, end_date_obj 159 | 160 | 161 | # --- Scraping Control --- 162 | def start_scraping_base(self, mode: str): 163 | """Base function to validate inputs and initiate scraping for any mode.""" 164 | if self.app.scraping_thread and self.app.scraping_thread.is_alive(): 165 | messagebox.showwarning("Process Running", "A scraping process is already active.") 166 | return 167 | 168 | # === NEW: Validate Channel List File from Dropdown === 169 | selected_filename = self.app.channellist_path.get().strip() 170 | if not selected_filename or selected_filename in ["No lists found", "Error reading lists", "Error scanning lists"]: 171 | messagebox.showwarning("Missing Input", 
"Please select a valid channel list from the dropdown.\nEnsure the 'channelslists' folder exists and contains .txt files.") 172 | return 173 | 174 | # Construct the full path 175 | channelslists_dir = Path(self.app.base_dir) / "channelslists" 176 | channellist_file = str(channelslists_dir / selected_filename) # Convert Path to string for os.path.exists 177 | 178 | if not os.path.exists(channellist_file): # Check full path existence 179 | messagebox.showerror("File Error", f"The selected channel list file does not seem to exist:\n{channellist_file}") 180 | # Consider re-populating the dropdown here if the file vanished 181 | # self.app._populate_channel_list_dropdown() 182 | return 183 | # === END NEW === 184 | 185 | # Get and Validate Dates (No changes needed here) 186 | date_info = self._get_dates_for_mode(mode) 187 | if date_info is None: return 188 | target_date_obj, start_date_obj, end_date_obj = date_info 189 | 190 | # --- Start Scraping Thread --- 191 | self.app.stop_event.clear() 192 | 193 | # Clear log area (using CTkTextbox configure) 194 | if hasattr(self.app, 'log_text') and self.app.log_text: 195 | self.app.log_text.configure(state="normal") 196 | self.app.log_text.delete('1.0', ctk.END) 197 | self.app.log_text.configure(state="disabled") 198 | 199 | self.log_message(f"Initiating scraping process (Mode: '{mode}', List: '{selected_filename}')...", "INFO") 200 | self.disable_action_buttons() 201 | 202 | self.app.scraping_thread = threading.Thread( 203 | target=self.scrape_in_thread, 204 | # Pass the FULL PATH to the scrape function 205 | args=(channellist_file, mode, target_date_obj, start_date_obj, end_date_obj), 206 | daemon=True 207 | ) 208 | self.app.scraping_thread.start() 209 | 210 | def stop_scraping(self): 211 | # (Keep this method as is, stop_event is independent of UI lib) 212 | if self.app.scraping_thread and self.app.scraping_thread.is_alive(): 213 | self.log_message("Stop signal sent to scraping thread.", "WARN") 214 | 
self.app.stop_event.set() 215 | if hasattr(self.app, 'stop_button') and self.app.stop_button: 216 | self.app.stop_button.configure(state="disabled") # Use configure for ctk 217 | else: 218 | self.log_message("No active scraping process to stop.", "INFO") 219 | 220 | def scrape_in_thread(self, channellist_file, mode, target_date, start_date, end_date): 221 | # (Keep this method's core logic - it calls run_scraping) 222 | # (run_scraping is UI independent, just uses callbacks) 223 | output_files = [] 224 | error_occurred = False 225 | final_message = "An unknown error occurred." 226 | final_message_type = "ERROR" 227 | 228 | try: 229 | output_files = run_scraping( 230 | channellist_file=channellist_file, # Pass the full path 231 | mode=mode, 232 | target_date=target_date, 233 | start_date=start_date, 234 | end_date=end_date, 235 | log_callback=self.log_message, 236 | stop_event=self.app.stop_event, 237 | base_dir=self.app.base_dir 238 | ) 239 | 240 | if self.app.stop_event.is_set(): 241 | final_message = "Scraping process was interrupted by the user." 242 | final_message_type = "WARN" 243 | elif not output_files: 244 | final_date_to_show = target_date if mode != 'date_range' else end_date 245 | start_date_for_msg = start_date if mode == 'date_range' else None 246 | final_message = self._generate_no_posts_message(mode, final_date_to_show, start_date_for_msg) 247 | final_message_type = "INFO" 248 | else: 249 | files_str = "\n".join([os.path.basename(f) for f in output_files]) 250 | final_message = f"Scraping completed successfully.\nCreated/updated files:\n{files_str}" 251 | final_message_type = "SUCCESS" 252 | 253 | except ImportError as e: 254 | error_occurred = True 255 | final_message = f"Import Error: {e}\nCannot run scraping. Check installation and file structure." 
256 | self.log_message(final_message, "ERROR") 257 | except (FileNotFoundError, ValueError, RuntimeError, NameError) as e: 258 | error_occurred = True 259 | final_message = f"Scraping failed: {e}" 260 | except Exception as e: 261 | error_occurred = True 262 | final_message = f"An unexpected critical error occurred: {type(e).__name__} - {e}" 263 | self.log_message(final_message, "ERROR") 264 | 265 | finally: 266 | # --- Schedule GUI updates back on the main thread --- 267 | if self.app.master: # Basic check if master exists 268 | self.app.master.after(0, self.show_final_message, final_message, final_message_type, error_occurred) 269 | self.app.master.after(0, self.reset_buttons) 270 | 271 | def _generate_no_posts_message(self, mode: str, target_date: Optional[date], start_date: Optional[date]) -> str: 272 | # (Keep this method as is) 273 | date_info = "" 274 | cutoff_str = f" (after {CUTOFF_DATE.strftime('%Y-%m-%d')})" 275 | if mode == 'date_range' and start_date and target_date: date_info = f" for range {start_date.strftime('%Y-%m-%d')} to {target_date.strftime('%Y-%m-%d')}" 276 | elif target_date and mode != 'all': date_info = f" for {target_date.strftime('%Y-%m-%d')}" 277 | elif mode == 'all': date_info = " in 'all' mode" 278 | return f"No posts matching the criteria were found{date_info}{cutoff_str}." 279 | 280 | # --- GUI Message Functions (run in main thread via master.after) --- 281 | def show_final_message(self, message: str, message_type: str, error_occurred: bool): 282 | # (Keep this method as is, uses standard messagebox) 283 | try: 284 | if self.app.master: 285 | if message_type == "SUCCESS": messagebox.showinfo("Success!", message) 286 | elif message_type == "INFO": messagebox.showinfo("No Results", message) 287 | elif message_type == "WARN": messagebox.showwarning("Interrupted", message) 288 | else: # ERROR 289 | full_error_msg = f"{message}\n\nPlease check the logs for more details." 
290 | messagebox.showerror("Error", full_error_msg) 291 | except tk.TclError: pass # Window might have been closed 292 | 293 | # --- Button State Management --- 294 | def _set_button_state(self, button_name: str, state: str): # Use string state for ctk 295 | """Safely sets the state of a button widget attribute on the app instance.""" 296 | button_widget = getattr(self.app, button_name, None) 297 | # Check for CTkButton, fallback to tk.Button/ttk.Button might be needed if mixing 298 | # Also check if Spinboxes are controlled here - they use standard tk state 299 | if isinstance(button_widget, (ctk.CTkButton)): 300 | # Check if widget is destroyed - basic check if object exists 301 | if button_widget: 302 | try: 303 | button_widget.configure(state=state) # Use configure for ctk 304 | except Exception as e: # Catch broad exceptions 305 | self.log_message(f"Could not configure button '{button_name}': {e}", "WARN") 306 | pass 307 | elif isinstance(button_widget, (ttk.Spinbox)): # Handle spinboxes if needed 308 | if button_widget and getattr(button_widget, 'winfo_exists', lambda: False)(): 309 | try: 310 | # Spinbox uses standard tk state constants 311 | tk_state = tk.NORMAL if state == "normal" else tk.DISABLED 312 | button_widget.config(state=tk_state) 313 | except (tk.TclError, AttributeError): pass 314 | 315 | 316 | def disable_action_buttons(self): 317 | """Disables all action buttons and enables the stop button.""" 318 | if not self.app.master: return 319 | buttons_to_disable = [ 320 | 'specific_date_button', 'range_date_button', 'today_button', 321 | 'yesterday_button', 'all_button', 322 | 'channel_list_dropdown' # Disable dropdown during run 323 | # 'browse_button' # Removed 324 | ] 325 | # Also disable spinboxes 326 | spinboxes_to_disable = [ 327 | 'day_spinbox', 'month_spinbox', 'year_spinbox', 328 | 'start_day_spinbox', 'start_month_spinbox', 'start_year_spinbox', 329 | 'end_day_spinbox', 'end_month_spinbox', 'end_year_spinbox' 330 | ] 331 | for btn_name in 
buttons_to_disable + spinboxes_to_disable: 332 | self._set_button_state(btn_name, "disabled") # Use string state "disabled" 333 | self._set_button_state('stop_button', "normal") # Use string state "normal" 334 | 335 | def reset_buttons(self): 336 | """Resets button states after scraping finishes or is stopped.""" 337 | if not self.app.master: return 338 | buttons_to_enable = [ 339 | 'specific_date_button', 'range_date_button', 'today_button', 340 | 'yesterday_button', 'all_button', 341 | 'channel_list_dropdown' # Re-enable dropdown 342 | # 'browse_button' # Removed 343 | ] 344 | # Also enable spinboxes 345 | spinboxes_to_enable = [ 346 | 'day_spinbox', 'month_spinbox', 'year_spinbox', 347 | 'start_day_spinbox', 'start_month_spinbox', 'start_year_spinbox', 348 | 'end_day_spinbox', 'end_month_spinbox', 'end_year_spinbox' 349 | ] 350 | for btn_name in buttons_to_enable + spinboxes_to_enable: 351 | self._set_button_state(btn_name, "normal") 352 | self._set_button_state('stop_button', "disabled") 353 | 354 | 355 | # --- Window Closing Handler --- 356 | def on_closing(self): 357 | # (Keep logic, check master existence simply) 358 | if self.app.scraping_thread and self.app.scraping_thread.is_alive(): 359 | if messagebox.askyesno("Confirm Exit", "Scraping is still in progress.\nDo you want to stop the process and exit?"): 360 | self.log_message("Exit requested during active scraping. 
Sending stop signal...", "WARN") 361 | self.app.stop_event.set() 362 | # Use destroy directly after a short delay 363 | self.app.master.after(200, self.app.master.destroy) 364 | else: 365 | return # Do not close 366 | else: 367 | self.log_message("Application closing.", "INFO") 368 | if self.app.master: # Check before destroying 369 | self.app.master.destroy() -------------------------------------------------------------------------------- /src/gui/main_window.py: -------------------------------------------------------------------------------- 1 | import customtkinter as ctk 2 | from tkinter import messagebox 3 | import queue 4 | import threading 5 | from datetime import date 6 | from typing import Optional, List 7 | from pathlib import Path 8 | import os 9 | 10 | # Import UI component creation functions and event handlers 11 | from .ui_components import ( 12 | create_file_selection_ui, 13 | create_specific_date_picker_ui, 14 | create_date_range_picker_ui, 15 | create_action_buttons_ui, 16 | create_log_ui, 17 | # Import the analysis info UI function separately 18 | create_analysis_info_ui 19 | ) 20 | from .event_handlers import GuiEventHandlers 21 | from src.config import CUTOFF_DATE # Ensure CUTOFF_DATE is imported if used here, though likely not directly 22 | 23 | class TelegramScraperGUI: 24 | """ 25 | Main class for the Telegram Scraper GUI application using CustomTkinter. 26 | Orchestrates UI setup and event handling with a grid layout. 27 | """ 28 | def __init__(self, master: ctk.CTk, base_dir: str): 29 | """ 30 | Initializes the main GUI window. 31 | 32 | Args: 33 | master: The root CustomTkinter window (ctk.CTk instance). 34 | base_dir: The base directory path (string) for file operations. 
35 | """ 36 | self.master: ctk.CTk = master 37 | self.base_dir: str = base_dir 38 | self.master.title("Telegram Post Downloader v3.2 (Grid Layout)") # Updated version/title 39 | self.master.geometry("850x750") # Adjusted size for sidebar 40 | 41 | # --- Configure root window's grid --- 42 | # Column 0 (main content) will expand, Column 1 (sidebar) fixed width 43 | self.master.grid_columnconfigure(0, weight=1) 44 | self.master.grid_columnconfigure(1, weight=0) # Sidebar doesn't expand horizontally 45 | # Row 0 will contain everything and expand vertically 46 | self.master.grid_rowconfigure(0, weight=1) 47 | 48 | # --- CustomTkinter Variables --- 49 | self.channellist_path = ctk.StringVar() # Will store the selected *filename* 50 | 51 | # Date Picker Variables (initialize with today's date) 52 | today = date.today() 53 | self.sel_year = ctk.IntVar(value=today.year) 54 | self.sel_month = ctk.IntVar(value=today.month) 55 | self.sel_day = ctk.IntVar(value=today.day) 56 | self.start_year = ctk.IntVar(value=today.year) 57 | self.start_month = ctk.IntVar(value=today.month) 58 | self.start_day = ctk.IntVar(value=1) # Default start day to 1st 59 | self.end_year = ctk.IntVar(value=today.year) 60 | self.end_month = ctk.IntVar(value=today.month) 61 | self.end_day = ctk.IntVar(value=today.day) # Default end day to today 62 | 63 | # --- Threading and Logging --- 64 | self.log_queue: queue.Queue[str] = queue.Queue() 65 | self.stop_event = threading.Event() 66 | self.scraping_thread: Optional[threading.Thread] = None 67 | 68 | # --- Initialize Event Handlers --- 69 | self.handlers = GuiEventHandlers(self) 70 | 71 | # --- Create Main Frames using grid --- 72 | # Main content frame on the left 73 | self.main_content_frame = ctk.CTkFrame(master, corner_radius=0, fg_color="transparent") 74 | # Place in grid cell (0,0), make it stick to all sides (nsew) 75 | self.main_content_frame.grid(row=0, column=0, padx=(10, 5), pady=10, sticky="nsew") 76 | # Configure internal row for log frame 
(assuming 5 main widgets packed above it) 77 | self.main_content_frame.grid_rowconfigure(5, weight=1) # Allow log frame (index 5 if 5 packed above) to expand 78 | 79 | # Sidebar frame on the right 80 | self.sidebar_frame = ctk.CTkFrame(master, width=200, corner_radius=0) # Keep defined width 81 | # Place in grid cell (0,1), make it stick vertically (ns) 82 | self.sidebar_frame.grid(row=0, column=1, padx=(5, 10), pady=10, sticky="ns") 83 | # Prevent sidebar from shrinking to content 84 | self.sidebar_frame.grid_propagate(False) 85 | 86 | 87 | # --- Create UI Sections (using pack inside their respective frames) --- 88 | # Widgets packed into main_content_frame 89 | create_file_selection_ui(self.main_content_frame, self) 90 | create_specific_date_picker_ui(self.main_content_frame, self) 91 | create_date_range_picker_ui(self.main_content_frame, self) 92 | create_action_buttons_ui(self.main_content_frame, self) 93 | create_log_ui(self.main_content_frame, self) # This creates self.log_text 94 | 95 | # Widgets packed into sidebar_frame 96 | create_analysis_info_ui(self.sidebar_frame, self) 97 | 98 | # --- Populate Channel List Dropdown --- 99 | self._populate_channel_list_dropdown() 100 | 101 | # --- Initialize and Start Log Processing Loop --- 102 | self.process_log_queue() 103 | 104 | # --- Initial Validation for Date Pickers --- 105 | self.validate_date_spinbox('sel') 106 | self.validate_date_spinbox('start') 107 | self.validate_date_spinbox('end') 108 | 109 | # --- Window Close Protocol --- 110 | self.master.protocol("WM_DELETE_WINDOW", self.on_closing) 111 | 112 | # Log application start 113 | self.log_message("Application initialized with grid layout.", "INFO") 114 | 115 | def _populate_channel_list_dropdown(self): 116 | """Finds .txt files in 'channelslists' and populates the dropdown.""" 117 | channelslists_dir = Path(self.base_dir) / "channelslists" 118 | channel_files: List[str] = [] 119 | default_selection = "No lists found" 120 | 121 | # Ensure dropdown widget 
exists before trying to configure it 122 | if not hasattr(self, 'channel_list_dropdown') or not self.channel_list_dropdown: 123 | self.log_message("Channel list dropdown widget not yet created.", "ERROR") 124 | self.channellist_path.set(default_selection) # Set variable anyway 125 | return 126 | 127 | if channelslists_dir.is_dir(): 128 | try: 129 | # Get only filenames, filter for .txt, sort alphabetically 130 | channel_files = sorted([ 131 | f.name for f in channelslists_dir.glob("*.txt") if f.is_file() 132 | ]) 133 | if channel_files: 134 | default_selection = channel_files[0] # Default to the first file found 135 | # Configure the dropdown 136 | self.channel_list_dropdown.configure(values=channel_files, state="readonly") # Use readonly state 137 | self.log_message(f"Found channel lists: {', '.join(channel_files)}", "DEBUG") 138 | else: 139 | self.log_message(f"No .txt files found in {channelslists_dir}", "WARN") 140 | self.channel_list_dropdown.configure(values=[default_selection], state="disabled") 141 | 142 | except OSError as e: 143 | self.log_message(f"Error reading channel list directory {channelslists_dir}: {e}", "ERROR") 144 | self.channel_list_dropdown.configure(values=[f"Error reading lists"], state="disabled") 145 | default_selection = "Error reading lists" 146 | except Exception as e: 147 | self.log_message(f"Unexpected error scanning for channel lists: {e}", "ERROR") 148 | self.channel_list_dropdown.configure(values=[f"Error scanning lists"], state="disabled") 149 | default_selection = "Error scanning lists" 150 | else: 151 | self.log_message(f"Channel list directory not found: {channelslists_dir}", "WARN") 152 | self.channel_list_dropdown.configure(values=[default_selection], state="disabled") 153 | 154 | # Set the variable for the dropdown 155 | self.channellist_path.set(default_selection) 156 | 157 | 158 | # --- Method Delegation to Handlers --- 159 | # These methods provide a clean interface and delegate the actual work 160 | # to the 
    def open_file_dialog(self): # Deprecated method
        """Delegates the (deprecated) file-dialog action to the handler."""
        self.handlers.open_file_dialog()

    def validate_date_spinbox(self, prefix: str):
        """Delegates day-spinbox clamping for picker `prefix` to the handler."""
        self.handlers.validate_date_spinbox(prefix)

    def log_message(self, message: str, level: str = "INFO"):
        """Logs a message via the handler (which queues it)."""
        self.handlers.log_message(message, level)

    def process_log_queue(self):
        """Starts or continues processing the log queue via the handler."""
        self.handlers.process_log_queue()

    def start_scraping_base(self, mode: str):
        """Initiates scraping via the handler."""
        self.handlers.start_scraping_base(mode)

    def stop_scraping(self):
        """Stops scraping via the handler."""
        self.handlers.stop_scraping()

    def disable_action_buttons(self):
        """Disables buttons during scraping via the handler."""
        self.handlers.disable_action_buttons()

    def reset_buttons(self):
        """Resets button states via the handler."""
        self.handlers.reset_buttons()

    def on_closing(self):
        """Handles window closing via the handler."""
        self.handlers.on_closing()

    # Note: The actual Tkinter mainloop is called in getTelegram.py, not here.
-------------------------------------------------------------------------------- /src/gui/ui_components.py: -------------------------------------------------------------------------------- 1 | import customtkinter as ctk 2 | from tkinter import ttk 3 | from datetime import date 4 | import os 5 | import webbrowser # Import webbrowser for opening links 6 | 7 | # Import config only for CUTOFF_DATE display/limits 8 | from src.config import CUTOFF_DATE 9 | 10 | # Standard padding values 11 | PAD_X = 10 12 | PAD_Y = 5 13 | INNER_PAD_X = 5 14 | INNER_PAD_Y = 5 15 | 16 | # --- File Selection, Date Pickers, Action Buttons --- 17 | # (Keep create_file_selection_ui, create_specific_date_picker_ui, 18 | # create_date_range_picker_ui, create_action_buttons_ui as they were 19 | # in the customtkinter version from the previous steps) 20 | # Example placeholder for one function: 21 | def create_file_selection_ui(master_frame: ctk.CTk, app_instance): 22 | """Creates the UI section for selecting the channel list file using a dropdown.""" 23 | file_frame = ctk.CTkFrame(master_frame) 24 | file_frame.pack(padx=PAD_X, pady=(PAD_Y * 2, PAD_Y), fill="x", anchor="n") 25 | section_label = ctk.CTkLabel(file_frame, text="1. Select Channel List", font=ctk.CTkFont(weight="bold")) 26 | section_label.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0)) 27 | app_instance.channel_list_dropdown = ctk.CTkComboBox( 28 | file_frame, variable=app_instance.channellist_path, state="readonly", width=250 29 | ) 30 | app_instance.channel_list_dropdown.pack(pady=INNER_PAD_Y, padx=INNER_PAD_X) 31 | info_label = ctk.CTkLabel( file_frame, text="Select a list. Lists are loaded from 'channelslists' folder.", font=ctk.CTkFont(size=10), text_color="gray") 32 | info_label.pack(pady=(0, INNER_PAD_Y), padx=INNER_PAD_X) 33 | return file_frame 34 | 35 | # --- (Include the other create_*_ui functions here from previous steps) --- 36 | def create_specific_date_picker_ui(master_frame: ctk.CTk, app_instance): 37 | # ... 
(Implementation from previous step) ... 38 | date_frame = ctk.CTkFrame(master_frame) 39 | date_frame.pack(padx=PAD_X, pady=PAD_Y, fill="x", anchor="n") 40 | section_label = ctk.CTkLabel(date_frame, text="2a. Download for Specific Date", font=ctk.CTkFont(weight="bold")) 41 | section_label.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0)) 42 | date_picker_inner_frame = ctk.CTkFrame(date_frame, fg_color="transparent") 43 | date_picker_inner_frame.pack(pady=INNER_PAD_Y, fill="x", padx=INNER_PAD_X) 44 | date_spin_frame = ctk.CTkFrame(date_picker_inner_frame, fg_color="transparent") 45 | date_spin_frame.pack(side="left", padx=(0, PAD_X)) 46 | current_year = date.today().year; min_year = CUTOFF_DATE.year 47 | ctk.CTkLabel(date_spin_frame, text="Day:", width=30).pack(side="left", padx=(0, 2)) 48 | app_instance.day_spinbox = ttk.Spinbox(date_spin_frame, from_=1, to=31, textvariable=app_instance.sel_day, width=4, command=lambda: app_instance.validate_date_spinbox('sel')) 49 | app_instance.day_spinbox.pack(side="left", padx=(0, 8)) 50 | ctk.CTkLabel(date_spin_frame, text="Month:", width=40).pack(side="left", padx=(0, 2)) 51 | app_instance.month_spinbox = ttk.Spinbox( date_spin_frame, from_=1, to=12, textvariable=app_instance.sel_month, width=4, command=lambda: app_instance.validate_date_spinbox('sel')) 52 | app_instance.month_spinbox.pack(side="left", padx=(0, 8)) 53 | ctk.CTkLabel(date_spin_frame, text="Year:", width=35).pack(side="left", padx=(0, 2)) 54 | app_instance.year_spinbox = ttk.Spinbox( date_spin_frame, from_=min_year, to=current_year, textvariable=app_instance.sel_year, width=6, command=lambda: app_instance.validate_date_spinbox('sel')) 55 | app_instance.year_spinbox.pack(side="left") 56 | app_instance.specific_date_button = ctk.CTkButton(date_picker_inner_frame, text="Download This Date", command=lambda: app_instance.start_scraping_base('specific_date'), width=160) 57 | app_instance.specific_date_button.pack(side="left", padx=(PAD_X, 0)) 58 | return 
date_frame 59 | 60 | def create_date_range_picker_ui(master_frame: ctk.CTk, app_instance): 61 | # ... (Implementation from previous step) ... 62 | range_frame = ctk.CTkFrame(master_frame) 63 | range_frame.pack(padx=PAD_X, pady=PAD_Y, fill="x", anchor="n") 64 | section_label = ctk.CTkLabel(range_frame, text="2b. Download Date Range", font=ctk.CTkFont(weight="bold")) 65 | section_label.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0)) 66 | current_year = date.today().year; min_year = CUTOFF_DATE.year; label_width = 70 67 | start_frame = ctk.CTkFrame(range_frame, fg_color="transparent") 68 | start_frame.pack(pady=(INNER_PAD_Y, 2), fill="x", padx=INNER_PAD_X) 69 | ctk.CTkLabel(start_frame, text="Start Date:", width=label_width, anchor='w').pack(side="left", padx=(0, INNER_PAD_X)) 70 | start_spin_frame = ctk.CTkFrame(start_frame, fg_color="transparent") 71 | start_spin_frame.pack(side="left") 72 | ctk.CTkLabel(start_spin_frame, text="D:", width=15).pack(side="left", padx=(0, 1)) 73 | app_instance.start_day_spinbox = ttk.Spinbox(start_spin_frame, from_=1, to=31, width=4, textvariable=app_instance.start_day, command=lambda: app_instance.validate_date_spinbox('start')) 74 | app_instance.start_day_spinbox.pack(side="left", padx=(0, 5)) 75 | ctk.CTkLabel(start_spin_frame, text="M:", width=15).pack(side="left", padx=(0, 1)) 76 | app_instance.start_month_spinbox = ttk.Spinbox(start_spin_frame, from_=1, to=12, width=4, textvariable=app_instance.start_month, command=lambda: app_instance.validate_date_spinbox('start')) 77 | app_instance.start_month_spinbox.pack(side="left", padx=(0, 5)) 78 | ctk.CTkLabel(start_spin_frame, text="Y:", width=15).pack(side="left", padx=(0, 1)) 79 | app_instance.start_year_spinbox = ttk.Spinbox(start_spin_frame, from_=min_year, to=current_year, width=6, textvariable=app_instance.start_year, command=lambda: app_instance.validate_date_spinbox('start')) 80 | app_instance.start_year_spinbox.pack(side="left") 81 | end_frame = 
ctk.CTkFrame(range_frame, fg_color="transparent") 82 | end_frame.pack(pady=2, fill="x", padx=INNER_PAD_X) 83 | ctk.CTkLabel(end_frame, text="End Date:", width=label_width, anchor='w').pack(side="left", padx=(0, INNER_PAD_X)) 84 | end_spin_frame = ctk.CTkFrame(end_frame, fg_color="transparent") 85 | end_spin_frame.pack(side="left") 86 | ctk.CTkLabel(end_spin_frame, text="D:", width=15).pack(side="left", padx=(0, 1)) 87 | app_instance.end_day_spinbox = ttk.Spinbox(end_spin_frame, from_=1, to=31, width=4, textvariable=app_instance.end_day, command=lambda: app_instance.validate_date_spinbox('end')) 88 | app_instance.end_day_spinbox.pack(side="left", padx=(0, 5)) 89 | ctk.CTkLabel(end_spin_frame, text="M:", width=15).pack(side="left", padx=(0, 1)) 90 | app_instance.end_month_spinbox = ttk.Spinbox(end_spin_frame, from_=1, to=12, width=4, textvariable=app_instance.end_month, command=lambda: app_instance.validate_date_spinbox('end')) 91 | app_instance.end_month_spinbox.pack(side="left", padx=(0, 5)) 92 | ctk.CTkLabel(end_spin_frame, text="Y:", width=15).pack(side="left", padx=(0, 1)) 93 | app_instance.end_year_spinbox = ttk.Spinbox(end_spin_frame, from_=min_year, to=current_year, width=6, textvariable=app_instance.end_year, command=lambda: app_instance.validate_date_spinbox('end')) 94 | app_instance.end_year_spinbox.pack(side="left") 95 | button_frame = ctk.CTkFrame(range_frame, fg_color="transparent") 96 | button_frame.pack(pady=(INNER_PAD_Y * 2, INNER_PAD_Y)) 97 | app_instance.range_date_button = ctk.CTkButton(button_frame, text="Download Date Range", command=lambda: app_instance.start_scraping_base('date_range'), width=180) 98 | app_instance.range_date_button.pack() 99 | min_date_str = CUTOFF_DATE.strftime('%Y-%m-%d') 100 | ctk.CTkLabel(range_frame, text=f"Note: Data is available from {min_date_str} onwards.", text_color="gray").pack(pady=(0, INNER_PAD_Y), anchor='center') 101 | return range_frame 102 | 103 | def create_action_buttons_ui(master_frame: ctk.CTk, 
app_instance): 104 | # ... (Implementation from previous step) ... 105 | actions_frame = ctk.CTkFrame(master_frame) 106 | actions_frame.pack(padx=PAD_X, pady=PAD_Y, fill="x", anchor="n") 107 | section_label = ctk.CTkLabel(actions_frame, text="2c. Quick Actions / All / Stop", font=ctk.CTkFont(weight="bold")) 108 | section_label.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0)) 109 | button_inner_frame = ctk.CTkFrame(actions_frame, fg_color="transparent") 110 | button_inner_frame.pack(pady=INNER_PAD_Y) 111 | app_instance.today_button = ctk.CTkButton(button_inner_frame, text="Download Today", command=lambda: app_instance.start_scraping_base('today'), width=150) 112 | app_instance.today_button.pack(side="left", padx=INNER_PAD_X) 113 | app_instance.yesterday_button = ctk.CTkButton(button_inner_frame, text="Download Yesterday", command=lambda: app_instance.start_scraping_base('yesterday'), width=150) 114 | app_instance.yesterday_button.pack(side="left", padx=INNER_PAD_X) 115 | all_button_text = f"Download All (since {CUTOFF_DATE.year})" 116 | app_instance.all_button = ctk.CTkButton(button_inner_frame, text=all_button_text, command=lambda: app_instance.start_scraping_base('all'), width=180) 117 | app_instance.all_button.pack(side="left", padx=INNER_PAD_X) 118 | app_instance.stop_button = ctk.CTkButton(actions_frame, text="STOP SCRAPING", command=app_instance.stop_scraping, state="disabled", width=200, fg_color="#D32F2F", hover_color="#B71C1C", text_color="white", font=ctk.CTkFont(weight="bold")) 119 | app_instance.stop_button.pack(pady=(INNER_PAD_Y, INNER_PAD_Y*2)) 120 | ctk.CTkLabel(actions_frame, text="Warning: 'Download All' can take long & create many files!", text_color="#FF8C00").pack(pady=(0,INNER_PAD_Y)) 121 | return actions_frame 122 | # --- End Placeholder --- 123 | 124 | 125 | def create_log_ui(master_frame: ctk.CTk, app_instance): 126 | """Creates the scrollable logging text area.""" 127 | log_frame = ctk.CTkFrame(master_frame) 128 | # Make log frame 
expand vertically in the main content area 129 | log_frame.pack(padx=PAD_X, pady=PAD_Y, fill="both", expand=True) 130 | 131 | section_label = ctk.CTkLabel(log_frame, text="Logs", font=ctk.CTkFont(weight="bold")) 132 | section_label.pack(anchor="w", padx=INNER_PAD_X, pady=(INNER_PAD_Y, 0)) 133 | 134 | app_instance.log_text = ctk.CTkTextbox( 135 | log_frame, wrap="word", height=150, state="disabled" 136 | ) 137 | app_instance.log_text.pack(fill="both", expand=True, padx=INNER_PAD_X, pady=INNER_PAD_Y) 138 | 139 | # Configure tags (keep as before) 140 | colors = {"ERROR": "#FF0000", "WARN": "#FFA500", "INFO": "#007ACC", "DEBUG": "#808080"} 141 | for tag, color in colors.items(): 142 | app_instance.log_text.tag_config(tag, foreground=color) 143 | 144 | # --- REMOVED call to create_analysis_info_ui --- 145 | 146 | return log_frame 147 | 148 | # --- NEW/MODIFIED Analysis Info UI for Sidebar --- 149 | def create_analysis_info_ui(master_frame: ctk.CTk, app_instance): 150 | """Creates the informational section for the sidebar with a clickable link.""" 151 | # The master_frame is now the sidebar frame passed from main_window 152 | # No need to create another frame inside unless needed for padding/structure 153 | master_frame.configure(fg_color="transparent") # Make sidebar background transparent if desired 154 | 155 | section_label = ctk.CTkLabel(master_frame, text="3. Data Analysis Tip", font=ctk.CTkFont(weight="bold")) 156 | section_label.pack(pady=(5, 5), padx=INNER_PAD_X, anchor='w') # Use pack directly into sidebar frame 157 | 158 | # --- Text and Link Handling --- 159 | # Define parts of the text and the URL 160 | text_part1 = "After downloading, analyze the 'output_*.txt' files using RAG tools.\nA recommended tool is Google's NotebookLM:" 161 | url = "https://notebooklm.google.com/" 162 | text_part2 = "\nUpload the files there to ask questions about the content. Feel free to explore other tools." 
163 | 164 | # Set a wrap length appropriate for the sidebar width (adjust if needed) 165 | sidebar_wrap_length = 180 166 | 167 | # Create label for the text before the link 168 | label_part1 = ctk.CTkLabel( 169 | master_frame, 170 | text=text_part1, 171 | justify=ctk.LEFT, 172 | anchor='w', 173 | wraplength=sidebar_wrap_length 174 | ) 175 | label_part1.pack(pady=(0, 2), padx=INNER_PAD_X, fill='x') 176 | 177 | # Create the clickable link label 178 | link_label = ctk.CTkLabel( 179 | master_frame, 180 | text=url, 181 | text_color="cornflowerblue", # Standard link color 182 | cursor="hand2", # Change cursor on hover 183 | justify=ctk.LEFT, 184 | anchor='w', 185 | wraplength=sidebar_wrap_length 186 | ) 187 | link_label.pack(pady=2, padx=INNER_PAD_X, fill='x') 188 | # Bind left mouse click to open the URL 189 | link_label.bind("", lambda event: webbrowser.open_new(url)) 190 | # Optional: Add underline 191 | # link_font = ctk.CTkFont(underline=True) 192 | # link_label.configure(font=link_font) 193 | 194 | 195 | # Create label for the text after the link 196 | label_part2 = ctk.CTkLabel( 197 | master_frame, 198 | text=text_part2, 199 | justify=ctk.LEFT, 200 | anchor='w', 201 | wraplength=sidebar_wrap_length 202 | ) 203 | label_part2.pack(pady=(2, 5), padx=INNER_PAD_X, fill='x') 204 | 205 | # Return the master_frame (sidebar) itself, although not strictly needed 206 | return master_frame -------------------------------------------------------------------------------- /src/scraper/__init__.py: -------------------------------------------------------------------------------- 1 | # src/scraper/__init__.py 2 | # This file makes 'scraper' a Python package. 
from .core_logic import run_scraping
--------------------------------------------------------------------------------
/src/scraper/core_logic.py:
--------------------------------------------------------------------------------
import os
import re # Import re
import threading
import time
from datetime import date, datetime
from pathlib import Path # Use pathlib for path operations
from typing import List, Dict, Optional, TextIO, Tuple, Callable, Any # Added Any

# Import client and models from the sibling package
try:
    from my_telegram_scrapper import SimpleScraperClient, SimpleTgPost, ScrapedPage
except ImportError as e:
    # This error should ideally be caught at the application entry point,
    # but raise it here too for clarity if this module is used independently.
    raise ImportError("Could not import from 'my_telegram_scrapper'. Is it installed or in PYTHONPATH?") from e

# Import configuration and utilities
from src.config import CUTOFF_DATE
from src.utils.file_utils import archive_old_output_files, load_channels

# --- Helper Functions ---

def _determine_date_range(
    mode: str, target_date: Optional[date], start_date: Optional[date], end_date: Optional[date]
) -> Tuple[date, date, str]:
    """
    Determines the effective start and end dates for scraping based on the mode.
    Also generates a string describing the date criteria for logging.

    Returns:
        A tuple containing (effective_start_date, effective_end_date, log_date_info_string).
    Raises:
        ValueError if required dates for a mode are missing or invalid range.
    """
    log_date_info = ""
    # Default range is from CUTOFF_DATE up to today
    effective_start_date = CUTOFF_DATE
    effective_end_date = date.today()

    if mode in ['today', 'yesterday', 'specific_date']:
        if target_date is None:
            raise ValueError(f"Target date is required for mode '{mode}'.")
        # For single-date modes, start and end are the same
        effective_start_date = target_date
        effective_end_date = target_date
        log_date_info = f" for date {target_date.strftime('%Y-%m-%d')}"
    elif mode == 'date_range':
        if start_date is None or end_date is None:
            raise ValueError("Start and end dates are required for 'date_range' mode.")
        # Ensure range start is not before the absolute cutoff
        effective_start_date = max(start_date, CUTOFF_DATE)
        effective_end_date = end_date
        log_date_info = f" for range {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
        # Add effective range info if start date was adjusted by cutoff
        if start_date < CUTOFF_DATE:
            log_date_info += f" (effective start: {effective_start_date.strftime('%Y-%m-%d')})"
    elif mode == 'all':
        # Uses the default range (CUTOFF_DATE to today)
        log_date_info = f" (since {CUTOFF_DATE.strftime('%Y-%m-%d')})"
        # effective_start_date and effective_end_date remain as defaults

    # NOTE: any unrecognized mode value silently falls through to the default
    # full range (CUTOFF_DATE..today) with an empty log_date_info.
    # Final validation: end date should not be before start date (can happen with cutoff adjustment)
    if effective_end_date < effective_start_date:
        raise ValueError(f"Effective end date ({effective_end_date.strftime('%Y-%m-%d')}) cannot be before effective start date ({effective_start_date.strftime('%Y-%m-%d')}).")

    return effective_start_date, effective_end_date, log_date_info


def _write_post_to_file(handle: TextIO, channel: str, post: SimpleTgPost):
    """Formats and writes a single post to an open file handle."""
    try:
        post_content = post.content or "[No text content]"
        # Clean
potential multiple newlines or excessive whitespace
        # NOTE(review): \s{2,} collapses runs of 2+ whitespace chars but leaves
        # SINGLE newlines in the content untouched — confirm posts are meant to
        # stay one-per-line in the output files.
        post_content = re.sub(r'\s{2,}', ' ', post_content).strip() # Use re.sub

        post_url_str = post.post_url or "[No URL]"
        post_time_str = post.timestamp.strftime('%H:%M:%S') if post.timestamp else "[No Time]"
        # Format: ChannelName | URL (HH:MM:SS) : Content
        post_info = f"{channel} | {post_url_str} ({post_time_str}) : {post_content}\n"
        handle.write(post_info)
    except Exception as e:
        # Log error but don't stop the whole process for one write failure
        # Ideally use log_callback if available, else print
        print(f"Error writing post {post.post_url} for channel {channel}: {e}")

def _get_output_file_handle(
    post_date_str: str, output_dir: Path, base_list_name: str,
    open_files: Dict[str, TextIO], output_files_created: List[Path], log_callback: Callable
) -> Optional[TextIO]:
    """Gets or creates the file handle for a specific date.

    Handles are cached in ``open_files`` (keyed by date string) so each
    per-date output file is opened at most once per run; the caller is
    responsible for closing them.
    """
    if post_date_str in open_files:
        return open_files[post_date_str]

    file_path = output_dir / f"output_{base_list_name}_{post_date_str}.txt"
    try:
        # Use 'a' mode (append)
        handle = open(file_path, "a", encoding="utf-8")
        # Write header only if the file is newly created (or empty)
        if file_path.stat().st_size == 0:
            handle.write(f"### Posts from {post_date_str} (List: {base_list_name})\n\n")
        open_files[post_date_str] = handle
        # Track the file path if newly opened/created
        if file_path not in output_files_created:
            output_files_created.append(file_path)
            log_callback(f"Opened output file: {file_path.name}", "DEBUG") # Debug level log
        return handle
    except OSError as e:
        log_callback(f"Failed to open/write header to output file {file_path}: {e}", "ERROR")
        return None


def _process_scraped_post(
    post: SimpleTgPost, channel: str, mode: str,
    effective_start_date: date, effective_end_date: date,
    output_dir: Path, base_list_name: str,
    open_files: Dict[str, TextIO], output_files_created: List[Path],
    all_posts_for_specific_date: List[Tuple[str, SimpleTgPost]],
    log_callback: Callable
) -> bool:
    """
    Checks if a post matches date criteria and writes it to the appropriate file/list.
    Returns True if the post was processed, False otherwise.
    """
    if not post.timestamp:
        return False # Cannot process without a timestamp

    current_post_date = post.timestamp.date()

    # --- Date Filtering ---
    # 1. Check against absolute CUTOFF_DATE (skip if older)
    if current_post_date < CUTOFF_DATE:
        return False

    # 2. Check against mode-specific date range
    is_within_target_range = False
    if mode == 'all':
        # Already passed CUTOFF check, so it's valid for 'all'
        is_within_target_range = True
    elif mode == 'date_range':
        # Check if within the effective start/end dates
        is_within_target_range = (effective_start_date <= current_post_date <= effective_end_date)
    elif mode in ['today', 'yesterday', 'specific_date']:
        # Check if it matches the single target date (start date = end date = target date)
        is_within_target_range = (current_post_date == effective_start_date)

    if not is_within_target_range:
        return False # Post date does not match the required criteria for the mode

    # --- Process Matching Post ---
    processed = False
    if mode == 'all' or mode == 'date_range':
        # Write directly to the file corresponding to the post's date
        post_date_str = current_post_date.strftime("%Y-%m-%d")
        handle = _get_output_file_handle(
            post_date_str, output_dir, base_list_name,
            open_files, output_files_created, log_callback
        )
        if handle:
            _write_post_to_file(handle, channel, post)
            processed = True
        # else: Error getting handle was logged by _get_output_file_handle

    elif mode in ['today', 'yesterday', 'specific_date']:
        # Collect posts for single-date modes to write later (allows sorting)
        all_posts_for_specific_date.append((channel, post)) # Store channel name with post
        processed = True

    return processed


def _scrape_single_channel(
    client: SimpleScraperClient, channel: str, mode: str,
    effective_start_date: date, effective_end_date: date,
    log_callback: Callable, stop_event: threading.Event,
    output_dir: Path, base_list_name: str,
    open_files: Dict[str, TextIO], output_files_created: List[Path],
    all_posts_for_specific_date: List[Tuple[str, SimpleTgPost]]
) -> int:
    """
    Scrapes posts for a single channel, handling pagination and date filtering.

    Returns:
        The number of posts successfully processed for this channel matching criteria.
    """
    next_page_token: Optional[str] = None
    pages_checked = 0
    processed_posts_count = 0 # Posts processed *for this channel* matching criteria
    stop_channel_pagination = False
    last_oldest_date_on_page: Optional[date] = None # Track oldest date seen

    log_callback(f"Starting channel: {channel}", "DEBUG")

    while not stop_channel_pagination:
        if stop_event.is_set():
            log_callback(f"Stop signal received, interrupting channel {channel}.", "WARN")
            break # Break inner loop (pagination)

        pages_checked += 1
        log_callback(f" Fetching page {pages_checked} for {channel} (Token: {next_page_token or 'None'})...", "DEBUG")

        # --- Fetch Page ---
        try:
            page_data = client.get_channel_page(channel, before_token=next_page_token)
        except Exception as fetch_e: # Catch errors during fetch/parse at client level
            log_callback(f"Error fetching/parsing page {pages_checked} for {channel}: {fetch_e}", "ERROR")
            stop_channel_pagination = True # Stop processing this channel on error
            continue # Skip to next channel or finish

        if not page_data or not page_data.posts:
            log_callback(f" No more posts found or page error for {channel} on page {pages_checked}.", "INFO")
            stop_channel_pagination = True
            continue

        # --- Process Posts on Page ---
        posts_on_page = page_data.posts
        oldest_post_date_this_page: Optional[date] = None
        posts_processed_this_page = 0

        for post in posts_on_page:
            if post.timestamp:
                current_post_date = post.timestamp.date()
                # Update oldest date seen on this specific page
                if oldest_post_date_this_page is None or current_post_date < oldest_post_date_this_page:
                    oldest_post_date_this_page = current_post_date

            # Process the post (checks dates, writes/collects).
            # Posts without a timestamp are still passed in; _process_scraped_post
            # rejects them (returns False).
            if _process_scraped_post(
                post, channel, mode, effective_start_date, effective_end_date,
                output_dir, base_list_name, open_files, output_files_created,
                all_posts_for_specific_date, log_callback
            ):
                posts_processed_this_page += 1

        if posts_processed_this_page > 0:
            log_callback(f" Processed {posts_processed_this_page} matching posts from page {pages_checked}.", "DEBUG")
        # Log if posts existed but none matched criteria for this specific page
        elif posts_on_page:
            log_callback(f" No posts on page {pages_checked} matched date criteria for mode '{mode}'.", "DEBUG")

        processed_posts_count += posts_processed_this_page
        last_oldest_date_on_page = oldest_post_date_this_page # Store for pagination logic

        # --- Pagination Stop Conditions ---
        next_page_token = page_data.next_page_token
        if not next_page_token:
            log_callback(f" End of channel history reached for {channel} (no next page token).", "INFO")
            stop_channel_pagination = True
            continue

        # Stop if the oldest post found on the page is before the required start date
        if last_oldest_date_on_page:
            if last_oldest_date_on_page < effective_start_date:
                log_callback(f" Oldest post on page ({last_oldest_date_on_page.strftime('%Y-%m-%d')}) is before target start date ({effective_start_date.strftime('%Y-%m-%d')}). Stopping pagination for {channel}.", "INFO")
                stop_channel_pagination = True
                continue
        # Add a safety break if pages_checked gets excessively high?
        if pages_checked > 500: # Arbitrary limit to prevent infinite loops on weird pages
            log_callback(f"Warning: Exceeded 500 pages for channel {channel}. Stopping pagination.", "WARN")
            stop_channel_pagination = True
            continue

        # Optional: Short delay between page requests
        # time.sleep(0.1) # Be mindful of rate limiting

    log_callback(f"Finished channel {channel}. Found {processed_posts_count} matching posts.", "INFO")
    return processed_posts_count

# --- Main Scraping Function ---
def scrape_channels(
    channellist_file: str, mode: str,
    target_date: Optional[date], start_date: Optional[date], end_date: Optional[date],
    log_callback: Callable, stop_event: threading.Event,
    output_dir: Path
) -> List[Path]:
    """
    Scrapes posts from channels listed in a file based on mode and date criteria.

    Args:
        channellist_file: Path to the file containing channel names/URLs.
        mode: Scraping mode ('today', 'yesterday', 'specific_date', 'date_range', 'all').
        target_date: The specific date for single-date modes.
        start_date: Start date for range mode.
        end_date: End date for range mode.
        log_callback: Function to call for logging messages to the GUI/console.
        stop_event: Threading event to signal stopping the process.
        output_dir: Path object for the directory to save output files.

    Returns:
        A list of Path objects for the output files created or updated.
    Raises:
        ValueError, FileNotFoundError, RuntimeError on critical errors.
    """
    output_files_created: List[Path] = []
    # Get the base name of the channel list file (e.g., "proRuChannels")
    base_list_name = Path(channellist_file).stem

    try:
        effective_start_date, effective_end_date, log_date_info = _determine_date_range(
            mode, target_date, start_date, end_date
        )
    except ValueError as e:
        log_callback(f"Date range error: {e}", "ERROR")
        raise e # Re-raise for the caller (GUI thread)

    log_callback(f"Starting scraping process. Mode: '{mode}'{log_date_info}", "INFO")
    log_callback(f"Effective date range: {effective_start_date.strftime('%Y-%m-%d')} to {effective_end_date.strftime('%Y-%m-%d')}", "DEBUG")

    # Load channels (handles its own file errors)
    channels = load_channels(channellist_file, log_callback)
    if not channels: # load_channels should raise error if file empty/not found, but double-check
        log_callback("Channel list is empty or could not be loaded.", "ERROR")
        raise ValueError("Channel list is empty.") # Raise error to stop process

    # Dictionary to hold open file handles {date_str: file_handle} for range/all modes
    open_files: Dict[str, TextIO] = {}
    # List to store posts for single-date modes before writing
    all_posts_for_specific_date: List[Tuple[str, SimpleTgPost]] = []
    total_processed_posts = 0

    try:
        # Use the client as a context manager
        with SimpleScraperClient() as client:
            log_callback(f"Processing {len(channels)} channels from {Path(channellist_file).name}...", "INFO")
            for i, channel in enumerate(channels):
                if stop_event.is_set():
                    log_callback("Stop signal received. Aborting channel processing.", "WARN")
                    break # Break outer loop (channel iteration)

                log_callback(f"--- Channel {i+1}/{len(channels)}: {channel} ---", "INFO")

                # Scrape the current channel
                processed_count = _scrape_single_channel(
                    client, channel, mode, effective_start_date, effective_end_date,
                    log_callback, stop_event, output_dir, base_list_name,
                    open_files, output_files_created, all_posts_for_specific_date
                )
                total_processed_posts += processed_count

    except Exception as client_error:
        # Catch unexpected errors during client usage or scraping loop
        log_callback(f"Critical error during scraping: {client_error}", "ERROR")
        # Raise a runtime error to signal failure to the calling thread
        raise RuntimeError(f"Scraping failed due to an unexpected error: {client_error}") from client_error
    finally:
        # --- Cleanup: Close all files opened in range/all mode ---
        # Runs even when the scraping loop raised, so no handles leak.
        if open_files:
            log_callback(f"Closing {len(open_files)} output files...", "INFO")
            closed_count = 0
            for date_str, handle in open_files.items():
                try:
                    if handle and not handle.closed:
                        handle.close()
                        closed_count += 1
                except Exception as close_e:
                    log_callback(f"Error closing file for date {date_str}: {close_e}", "ERROR")
            log_callback(f"Closed {closed_count} files.", "DEBUG")

    # --- Write collected posts for single-date modes ---
    if mode in ['today', 'yesterday', 'specific_date'] and all_posts_for_specific_date:
        if target_date is None:
            # This shouldn't happen if date validation passed, but check defensively
            log_callback("Cannot write single-date file: Target date is missing.", "ERROR")
        else:
            output_file_path = output_dir / f"output_{base_list_name}_{target_date.strftime('%Y-%m-%d')}.txt"
            log_callback(f"Writing {len(all_posts_for_specific_date)} collected posts to {output_file_path.name}...", "INFO")
366 | try: 367 | # Sort posts by timestamp before writing for chronological order 368 | all_posts_for_specific_date.sort( 369 | key=lambda item: item[1].timestamp or datetime.min # Sort by post timestamp 370 | ) 371 | # Use 'w' mode (write/overwrite) for single-date files 372 | with open(output_file_path, "w", encoding="utf-8") as outfile: 373 | outfile.write(f"### Posts from {target_date.strftime('%Y-%m-%d')} (List: {base_list_name})\n\n") 374 | for channel_name, post in all_posts_for_specific_date: 375 | _write_post_to_file(outfile, channel_name, post) 376 | 377 | # Add the file path to the list of created files if not already present 378 | if output_file_path not in output_files_created: 379 | output_files_created.append(output_file_path) 380 | log_callback(f"Successfully wrote single-date file: {output_file_path.name}", "INFO") 381 | except OSError as write_e: 382 | log_callback(f"Failed to write output file {output_file_path.name}: {write_e}", "ERROR") 383 | # Optionally remove from created list if write failed partway? 384 | if output_file_path in output_files_created: 385 | output_files_created.remove(output_file_path) 386 | 387 | # --- Final Logging --- 388 | total_files = len(output_files_created) 389 | if stop_event.is_set(): 390 | log_callback(f"Scraping interrupted. Processed {total_processed_posts} posts into {total_files} files before stopping.", "WARN") 391 | elif total_processed_posts == 0: # Check if *any* posts matching criteria were found across all channels 392 | log_callback(f"Scraping finished. No posts found matching the specified criteria{log_date_info}.", "INFO") 393 | else: 394 | log_callback(f"Scraping finished successfully. 
def run_scraping(
    channellist_file: str, mode: str,
    target_date: Optional[date], start_date: Optional[date], end_date: Optional[date],
    log_callback: Callable[[str, str], None], stop_event: threading.Event, base_dir: str
) -> List[str]:
    """
    Entry point called by the GUI thread. Handles setup (archiving) and calls the main scraping logic.

    Args:
        channellist_file: Path to the channel list file (forwarded to scrape_channels).
        mode: Scraping mode (forwarded to scrape_channels; e.g. 'today', 'yesterday',
            'specific_date' or a range mode).
        target_date: Single target date for single-date modes (may be None).
        start_date: Range start for range modes (may be None).
        end_date: Range end for range modes (may be None).
        log_callback: Callable taking (message, level) used for all progress logging.
        stop_event: Cooperative cancellation flag, checked between phases.
        base_dir: The application's base directory (string); output files go here.

    Returns:
        A list of string paths for the output files created or updated.

    Raises:
        Exceptions caught during setup or scraping, to be handled by the calling GUI thread.
    """
    output_files: List[Path] = []  # Path objects returned by scrape_channels
    base_dir_path = Path(base_dir)
    output_dir = base_dir_path  # Output files go directly into the base directory

    try:
        # 1. Archive existing output files before starting
        archive_old_output_files(str(base_dir_path), log_callback)  # Util expects a string path

        if stop_event.is_set():
            log_callback("Process stopped during archiving phase.", "WARN")
            return []  # Return empty list if stopped early

        # 2. Run the main scraping function
        log_callback("Archiving complete. Starting channel processing...", "INFO")
        output_files = scrape_channels(
            channellist_file=channellist_file,
            mode=mode,
            target_date=target_date,
            start_date=start_date,
            end_date=end_date,
            log_callback=log_callback,  # Pass the callback directly
            stop_event=stop_event,
            output_dir=output_dir,  # Pass the Path object
        )
        # Convert Path objects back to strings for the GUI handler
        return [str(f) for f in output_files]

    except (FileNotFoundError, ValueError, RuntimeError, NameError, ImportError) as e:
        # Log errors originating from setup or scraping logic
        log_callback(f"Scraping process failed: {e}", "ERROR")
        # FIX (idiom): bare `raise` instead of `raise e` — re-raises the active
        # exception without touching its traceback, so the GUI thread sees the
        # original failure point unchanged.
        raise
    except Exception as e:
        # Log unexpected critical errors during the overall process
        log_callback(f"An unexpected critical error occurred in run_scraping: {e}", "ERROR")
        # Wrap in a RuntimeError for consistent handling by the GUI
        raise RuntimeError(f"Unexpected error during scraping execution: {e}") from e
def archive_old_output_files(base_dir_str: str, log_callback: Callable[[str, str], None]):
    """
    Moves existing output_*.txt files from the base directory to an archive subfolder.
    """
    base_dir = Path(base_dir_str)
    archive_path = base_dir / ARCHIVE_DIR_NAME

    try:
        # Create archive directory if it doesn't exist, and remember whether
        # we created it so the log message fires only on first creation.
        needed_creation = not archive_path.exists()
        if needed_creation:
            archive_path.mkdir(parents=True, exist_ok=True)
        if needed_creation and archive_path.exists():  # Check creation
            log_callback(f"Created archive directory: {archive_path}", "INFO")

        # Find output files in the base directory using pathlib's glob.
        # Pattern: output_ followed by any characters until .txt
        candidates = list(base_dir.glob("output_*.txt"))

        if not candidates:
            log_callback("No previous output files found to archive.", "INFO")
            return

        log_callback(f"Found {len(candidates)} output file(s) to archive...", "INFO")
        moved_count = 0
        for source in candidates:
            try:
                # Build a unique archive filename: stem + timestamp + random
                # suffix, keeping the original extension (.txt).
                stamp = time.strftime("%Y%m%d_%H%M%S")
                salt = random.randint(1000, 9999)
                archive_name = f"{source.stem}_{stamp}_{salt}{source.suffix}"
                destination_path = archive_path / archive_name

                # Move the file
                shutil.move(str(source), str(destination_path))
                log_callback(f" Archived {source.name} to {archive_name}", "DEBUG")  # Debug level
                moved_count += 1
            except OSError as e:
                log_callback(f"Error archiving file {source.name}: {e}", "ERROR")
            except Exception as e:  # Catch other potential errors during move/naming
                log_callback(f"Unexpected error archiving {source.name}: {e}", "ERROR")

        log_callback(f"Archiving complete. Moved {moved_count} file(s).", "INFO")

    except OSError as e:
        log_callback(f"Error creating or accessing archive directory {archive_path}: {e}", "ERROR")
    except Exception as e:
        log_callback(f"General error during archiving process: {e}", "ERROR")
def load_channels(channellist_file: str, log_callback: Callable[[str, str], None]) -> List[str]:
    """
    Loads and validates channel names/URLs from the given text file.
    Extracts the channel username (part after the last '/').

    Args:
        channellist_file: Path to a UTF-8 text file, one channel name or URL per
            line. Blank lines and lines starting with '#' are ignored.
        log_callback: Callable taking (message, level) for progress/error reporting.

    Returns:
        A list of unique, valid channel usernames, in file order.
    Raises:
        FileNotFoundError if the file doesn't exist.
        ValueError if the file contains no valid channel names.
        RuntimeError for other read errors.
    """
    channels: List[str] = []
    file_path = Path(channellist_file)

    if not file_path.is_file():
        log_callback(f"Channel list file not found: {channellist_file}", "ERROR")
        raise FileNotFoundError(f"Channel list file not found: {channellist_file}")

    try:
        with file_path.open("r", encoding="utf-8") as infile:
            for line_num, line in enumerate(infile, 1):
                original_line = line  # Keep original for logging errors
                line = line.strip()
                if not line or line.startswith('#'):  # Skip empty lines and comments
                    continue

                # Remove trailing slash(es) if present
                line = line.rstrip('/')

                # Extract the part after the last slash (potential channel name);
                # if no slash is present the whole line is treated as the name.
                if '/' in line:
                    channel_name = line.rsplit('/', 1)[-1]
                else:
                    channel_name = line

                # Basic validation: Telegram usernames are >= 5 chars, start
                # with a letter, and contain only letters, digits, underscores.
                # This also rejects full URLs mistakenly treated as names.
                if channel_name and re.match(r'^[a-zA-Z][a-zA-Z0-9_]{4,}$', channel_name):
                    if channel_name not in channels:  # Avoid duplicates
                        channels.append(channel_name)
                else:
                    log_callback(f"Skipping invalid or malformed channel entry on line {line_num}: '{original_line.strip()}' -> extracted '{channel_name}'", "WARN")

        log_callback(f"Loaded {len(channels)} unique, valid channel names from {file_path.name}.", "INFO")

    except OSError as e:
        log_callback(f"Error reading channel list file {channellist_file}: {e}", "ERROR")
        raise RuntimeError(f"Error reading channel list file: {e}") from e
    except Exception as e:
        log_callback(f"Unexpected error loading channels from {channellist_file}: {e}", "ERROR")
        raise RuntimeError(f"Unexpected error loading channels: {e}") from e

    # BUG FIX: this check previously lived inside the try block, so the
    # ValueError it raises was swallowed by the broad `except Exception`
    # handler above and re-raised as a misleading
    # RuntimeError("Unexpected error loading channels: ..."). Raising it
    # after the try preserves the documented ValueError contract, which
    # callers (e.g. run_scraping) catch explicitly.
    if not channels:
        log_callback("The channel list file is empty or contains no valid channel names.", "ERROR")
        raise ValueError(f"No valid channel names found in {file_path.name}.")

    return channels