├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── setup.cfg ├── setup.py ├── tchan.py └── test_tchan.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *~ 3 | .*.sw? 4 | .DS_Store 5 | .directory 6 | .sass-cache 7 | .vscode/ 8 | data/* 9 | docker/data 10 | collected-static/ 11 | 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | cover/ 65 | 66 | # Translations 67 | *.mo 68 | *.pot 69 | 70 | # Django stuff: 71 | *.log 72 | local_settings.py 73 | db.sqlite3 74 | db.sqlite3-journal 75 | 76 | # Flask stuff: 77 | instance/ 78 | .webassets-cache 79 | 80 | # Scrapy stuff: 81 | .scrapy 82 | 83 | # Sphinx documentation 84 | docs/_build/ 85 | 86 | # PyBuilder 87 | .pybuilder/ 88 | target/ 89 | 90 | # Jupyter Notebook 91 | .ipynb_checkpoints 92 | 93 | # IPython 94 | profile_default/ 95 | ipython_config.py 96 | 97 | # pyenv 98 | # For a library or package, you might want to ignore these files since the code is 99 | # intended to run in multiple environments; otherwise, check them in: 100 | # .python-version 101 | 102 | # pipenv 103 | # 
According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 106 | # install all needed dependencies. 107 | #Pipfile.lock 108 | 109 | # poetry 110 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 111 | # This is especially recommended for binary packages to ensure reproducibility, and is more 112 | # commonly ignored for libraries. 113 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 114 | #poetry.lock 115 | 116 | # pdm 117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 118 | #pdm.lock 119 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 120 | # in version control. 121 | # https://pdm.fming.dev/#use-with-ide 122 | .pdm.toml 123 | 124 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 125 | __pypackages__/ 126 | 127 | # Celery stuff 128 | celerybeat-schedule 129 | celerybeat.pid 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 167 | .idea/ 168 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 
18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | lint: 2 | autoflake --in-place --recursive --remove-unused-variables --remove-all-unused-imports . 3 | isort --skip migrations --skip wsgi --skip asgi --line-length 80 --multi-line VERTICAL_HANGING_INDENT --trailing-comma . 4 | black --exclude '(docker/|migrations/|config/settings\.py|manage\.py|\.direnv|\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|venv|\.svn|\.ipynb_checkpoints|_build|buck-out|build|dist|__pypackages__)' -l 80 . 5 | flake8 --config setup.cfg 6 | 7 | test: 8 | pytest 9 | 10 | test-v: 11 | pytest -vvv 12 | 13 | test-release: 14 | rm -rf build dist 15 | python setup.py sdist bdist_wheel 16 | twine check dist/* 17 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* 18 | 19 | release: 20 | rm -rf build dist 21 | python setup.py sdist bdist_wheel 22 | twine check dist/* 23 | twine upload dist/* 24 | 25 | .PHONY: lint test test-v test-release release 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tchan - Telegram Channel scraper 2 | 3 | Python library and command-line interface to scrape Telegram public channels. 
4 | Since this scraper uses Telegram Channel Web preview, **it won't work** for: 5 | 6 | - Public channels with "Restrict saving content" option enabled 7 | - Private channels 8 | - Public Groups 9 | - Private Groups 10 | 11 | It's also not possible to retrieve comments, since they're made on a group. 12 | 13 | ## Installing 14 | 15 | 16 | ```shell 17 | pip install tchan # Python library only 18 | pip install tchan[cli] # Library + CLI 19 | ``` 20 | 21 | ## Using as a library 22 | 23 | ```python 24 | from tchan import ChannelScraper 25 | 26 | scraper = ChannelScraper() 27 | for message in scraper.messages("tchantest"): 28 | print(f"New message ({message.type}) from {message.channel}:") 29 | print(f" id={message.id}") 30 | print(f" created_at={message.created_at.isoformat()}") 31 | print(f" text={message.text}") 32 | # TODO: add more parameters 33 | ``` 34 | 35 | ## Using as a command-line tool 36 | 37 | Scrape one or many channels and save all messages to `messages.csv`: 38 | 39 | ```shell 40 | tchan messages.csv channel1 [channel2 ... channelN] 41 | ``` 42 | 43 | ## Tests 44 | 45 | To run all tests, execute: 46 | 47 | ```shell 48 | make test # or just `pytest` 49 | ``` 50 | 51 | Make sure to install development requirements. 52 | 53 | Tests were made on a channel created for this task: 54 | [tchantest](https://t.me/tchantest). 
55 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = tchan 3 | version = 0.1.4 4 | description = Scrape Telegram public channels (Python library + CLI) 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | url = https://github.com/PythonicCafe/tchan/ 8 | keywords = telegram scraping social-media 9 | author = Álvaro Justen 10 | author_email = alvarojusten@gmail.com 11 | license = GNU Lesser General Public License v3 (LGPLv3) 12 | classifiers = 13 | Intended Audience :: Developers 14 | License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3) 15 | Operating System :: OS Independent 16 | Programming Language :: Python 17 | Programming Language :: Python :: 3 18 | Programming Language :: Python :: 3 :: Only 19 | Programming Language :: Python :: 3.7 20 | Programming Language :: Python :: 3.8 21 | Programming Language :: Python :: 3.9 22 | Programming Language :: Python :: 3.10 23 | Programming Language :: Python :: 3.11 24 | Topic :: Internet :: WWW/HTTP 25 | Topic :: Internet :: WWW/HTTP :: Dynamic Content 26 | 27 | [options] 28 | include_package_data = true 29 | py_modules = tchan 30 | python_requires = >=3.7 31 | install_requires = 32 | lxml 33 | requests 34 | 35 | [options.extras_require] 36 | cli = 37 | loguru 38 | tqdm 39 | dev = 40 | autoflake 41 | black 42 | flake8 43 | ipython 44 | isort 45 | pytest 46 | twine 47 | wheel 48 | 49 | [options.packages.find] 50 | exclude = 51 | data* 52 | test* 53 | Makefile 54 | 55 | [options.entry_points] 56 | console_scripts = 57 | tchan = tchan:main 58 | 59 | [flake8] 60 | max-line-length = 80 61 | exclude = .tox,.git,*/migrations/*,*/static/CACHE/*,docs,node_modules,docker/data/* 62 | ignore=I001,I003,I004,E231,E501,W503 63 | -------------------------------------------------------------------------------- /setup.py: 
"""tchan - scrape Telegram public channels through the ``t.me/s/`` web preview.

The module exposes pure HTML parsers (``parse_info``, ``parse_messages``), a
small HTTP client (``ChannelScraper``) and a command-line entry point
(``main``) that dumps messages to a CSV file.
"""
import datetime
import re
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Iterator, List, Optional, Tuple
from urllib.parse import urljoin, urlparse

import requests
from lxml.html import document_fromstring


__version__ = "0.1.4"
# Non-greedy capture so a style with several ``url('...')`` declarations
# yields only the first URL instead of everything up to the last ``')``.
REGEXP_BACKGROUND_IMAGE_URL = re.compile(r"background-image:url\('(.*?)'\)")


def _first(values, default=None):
    """Return the first item of an XPath result list, or ``default`` if empty."""
    return values[0] if values else default


def extract_bg_img(style):
    """Return the URL of the first ``background-image:url('...')`` in ``style``.

    Protocol-relative URLs (starting with ``//``) are prefixed with ``https:``.

    Raises:
        IndexError: if ``style`` contains no such declaration.
    """
    url = REGEXP_BACKGROUND_IMAGE_URL.findall(style)[0]
    if url.startswith("//"):
        url = f"https:{url}"
    return url


def convert_int(value):
    """Convert a counter such as ``"12"``, ``"1.2K"`` or ``"3.4M"`` to ``int``.

    Raises:
        ValueError: if ``value`` is not a number with an optional K/M suffix.
    """
    value = value.strip()
    # round() instead of int(): float error must not truncate the result
    # (float("1.005") * 1000 == 1004.9999999999999).
    if value.endswith("M"):
        return round(float(value[:-1]) * 1_000_000)
    if value.endswith("K"):
        return round(float(value[:-1]) * 1_000)
    return int(value)


@dataclass
class ChannelMessage:
    """One message parsed from a channel preview page."""

    id: int
    created_at: datetime.datetime
    type: str
    channel: str
    edited: bool
    # Each entry is a ``(kind, url)`` pair, e.g. ``("photo", "https://...")``.
    urls: List[Tuple[str, str]]
    author: Optional[str] = None
    text: Optional[str] = None
    views: Optional[int] = None
    reply_to_id: Optional[int] = None
    preview_url: Optional[str] = None
    preview_image_url: Optional[str] = None
    preview_site_name: Optional[str] = None
    preview_title: Optional[str] = None
    preview_description: Optional[str] = None
    forwarded_author: Optional[str] = None
    forwarded_author_url: Optional[str] = None


@dataclass
class ChannelInfo:
    """Channel metadata parsed from a channel preview page."""

    username: str
    title: Optional[str]
    image_url: Optional[str]
    description: Optional[str] = None
    subscribers: Optional[int] = None
    photos: Optional[int] = None
    videos: Optional[int] = None
    links: Optional[int] = None


def normalize_url(username_or_url):
    """Normalize username or URL to a channel canonical URL

    Accepts ``fulano``, ``@fulano``, ``t.me/fulano``, ``t.me/s/fulano`` and the
    ``https://`` variants (with or without a trailing message id) and returns
    ``https://t.me/s/fulano``.
    """
    path = urlparse(username_or_url).path
    if path.startswith("t.me/"):
        # Scheme-less input: urlparse leaves the host in ``path``. Drop
        # "t.me" but keep the slash so the checks below still apply.
        path = path[4:]
    if path.startswith("/s/"):
        path = path[2:]
    if path.startswith("/"):
        path = path[1:]
    if path.startswith("@"):
        path = path[1:]
    # Only the first path segment is the username; the rest (message id) is
    # discarded.
    return urljoin("https://t.me/s/", path.split("/")[0])


def extract_text(parts, delimiter="\n"):
    """Join the non-blank, stripped items of ``parts`` with ``delimiter``."""
    return delimiter.join(
        item.strip() for item in parts if item.strip()
    ).strip()


def parse_info(tree):
    """Build a :class:`ChannelInfo` from a parsed channel preview page.

    Values missing from the page are returned as ``None`` (``username`` is an
    empty string when the header is absent) instead of raising ``IndexError``.
    """
    username = extract_text(
        tree.xpath(
            "//div[@class = 'tgme_channel_info_header_username']//text()"
        ),
        delimiter="",
    )
    if username.startswith("@"):
        username = username[1:]
    title = _first(tree.xpath("//meta[@property = 'og:title']/@content"))
    image_url = _first(tree.xpath("//meta[@property = 'og:image']/@content"))
    description = _first(
        tree.xpath("//meta[@property = 'og:description']/@content")
    )

    counters = {}
    counters_div = _first(
        tree.xpath("//div[@class = 'tgme_channel_info_counters']")
    )
    if counters_div is not None:
        for counter_div in counters_div.xpath(
            ".//div[@class = 'tgme_channel_info_counter']"
        ):
            key = _first(
                counter_div.xpath(".//span[@class = 'counter_type']/text()")
            )
            value = _first(
                counter_div.xpath(".//span[@class = 'counter_value']/text()")
            )
            if key is None or value is None:
                continue
            counters[key.strip()] = convert_int(value)

    def counter(singular):
        # The page label follows the count ("1 subscriber", "3 photos"), so
        # accept both the singular and the plural key.
        value = counters.get(singular)
        return counters.get(f"{singular}s") if value is None else value

    return ChannelInfo(
        username=username,
        title=title,
        image_url=image_url,
        description=description,
        subscribers=counter("subscriber"),
        photos=counter("photo"),
        videos=counter("video"),
        links=counter("link"),
    )


def parse_messages(original_url, tree):
    """Yield a :class:`ChannelMessage` for every message found in ``tree``.

    Messages are yielded in reverse document order. Relative URLs are resolved
    against ``original_url``. Iteration stops as soon as a "no messages found"
    placeholder is met.

    Raises:
        IndexError: if a message wrapper lacks ``data-post`` or ``<time>``.
    """
    messages = tree.xpath("//div[contains(@class, 'tgme_widget_message_wrap')]")
    for message in reversed(messages):
        if message.xpath(".//div[contains(@class, 'tme_no_messages_found')]"):
            # XXX: this case may happen because a great number of requests was
            # made and Telegram sent this response as if there were no new
            # posts when actually there are.
            return
        channel, id_ = message.xpath(".//div/@data-post")[0].split("/")
        created_at = datetime.datetime.fromisoformat(
            message.xpath(".//time/@datetime")[0]
        )
        edited_text = message.xpath(
            ".//span[@class = 'tgme_widget_message_meta']/text()"
        )
        edited = "edited" in edited_text[0].strip() if edited_text else False
        author = _first(
            message.xpath(
                ".//span[@class = 'tgme_widget_message_from_author']/text()"
            )
        )
        text, views, type_, reply_to_id, urls = None, None, None, None, []
        forwarded_author, forwarded_author_url = None, None
        (
            preview_url,
            preview_image_url,
            preview_site_name,
            preview_title,
            preview_description,
        ) = (None, None, None, None, None)
        text_div = _first(
            message.xpath(
                ".//div[contains(@class, 'tgme_widget_message_text')]"
            )
        )
        if message.xpath(".//div[contains(@class, 'service_message')]"):
            if text_div is not None:  # guard: avoid None.xpath()
                text = extract_text(text_div.xpath(".//text()"), delimiter="")
            type_ = "service"
            image_url_text = message.xpath(
                ".//a[@class = 'tgme_widget_message_service_photo']/img/@src"
            )
            if image_url_text:
                urls.append(("photo", urljoin(original_url, image_url_text[0])))

        else:
            views_text = extract_text(
                message.xpath(
                    ".//span[contains(@class, 'tgme_widget_message_views')]//text()"
                ),
                delimiter="",
            )
            if views_text:
                views = convert_int(views_text)
            if text_div is not None:
                text = extract_text(text_div.xpath(".//text()"), delimiter="\n")
                emoji_style_text = text_div.xpath(
                    ".//i[@class = 'emoji']/@style"
                )
                if emoji_style_text:
                    urls.append(
                        (
                            "photo",
                            urljoin(
                                original_url,
                                extract_bg_img(emoji_style_text[0]),
                            ),
                        )
                    )
            else:
                # Text-less kinds: sticker, location and audio.
                sticker_div_list = message.xpath(
                    ".//div[contains(@class, 'tgme_widget_message_sticker_wrap')]//i[contains(@class, 'tgme_widget_message_sticker')]/@data-webp"
                )
                if sticker_div_list:
                    # TODO: add option to get sticker data from:
                    # message.xpath(".//i[contains(@class, 'tgme_widget_message_sticker')]/@style")[0]
                    type_ = "sticker"
                    urls.append(
                        ("photo", urljoin(original_url, sticker_div_list[0]))
                    )

                location_a_list = message.xpath(
                    ".//a[@class = 'tgme_widget_message_location_wrap']/@href"
                )
                if location_a_list:
                    type_ = "location"
                    urls.append(
                        ("link", urljoin(original_url, location_a_list[0]))
                    )

                audio_src_list = message.xpath(".//audio/@src")
                if audio_src_list:
                    # TODO: add duration to dataclass?
                    type_ = "audio"
                    urls.append(
                        ("audio", urljoin(original_url, audio_src_list[0]))
                    )

            # NOTE(review): this file was recovered from a whitespace-collapsed
            # copy; whether the document/poll checks belong here or inside the
            # text-less branch above could not be determined - confirm.
            document_class_list = message.xpath(
                ".//div[contains(@class, 'tgme_widget_message_document')]/@class"
            )
            if document_class_list:
                # TODO: get title, document type and other info
                type_ = "document"

            poll_div_list = message.xpath(
                ".//div[contains(@class, 'tgme_widget_message_poll')]"
            )
            if poll_div_list:
                # TODO: get other info
                type_ = "poll"

            photos_div_list = message.xpath(
                ".//a[contains(@class, 'tgme_widget_message_photo_wrap')]/@style"
            )
            if photos_div_list:
                urls.extend(
                    [
                        ("photo", urljoin(original_url, extract_bg_img(style)))
                        for style in photos_div_list
                    ]
                )
                type_ = "photo" if type_ is None else "multimedia"

            roundvideos_div_list = message.xpath(
                ".//video[contains(@class, 'tgme_widget_message_roundvideo')]/@src"
            )
            if roundvideos_div_list:
                # TODO: get video duration?
                urls.extend(
                    [
                        ("round-video", urljoin(original_url, url))
                        for url in roundvideos_div_list
                    ]
                )
                type_ = "round-video" if type_ is None else "multimedia"

            # The leading "." is required: a bare "//" searches the whole
            # document, which marked every message of a page as video as soon
            # as any single message contained a video player.
            video_link_list = message.xpath(
                ".//a[contains(@class, 'tgme_widget_message_video_player')]"
            )
            if video_link_list:
                type_ = "video" if type_ is None else "multimedia"
                # NOTE(review): nesting of this lookup under the player check
                # was reconstructed - confirm.
                videos_div_list = message.xpath(
                    ".//div[contains(@class, 'tgme_widget_message_video_wrap')]//video[contains(@class, 'tgme_widget_message_video')]/@src"
                )
                if videos_div_list:
                    # TODO: get video duration?
                    urls.extend(
                        [
                            ("video", urljoin(original_url, url))
                            for url in videos_div_list
                        ]
                    )

            reply_list = message.xpath(
                ".//a[contains(@class, 'tgme_widget_message_reply')]/@href"
            )
            if reply_list:
                # The replied-to message id is the last URL path segment.
                reply_to_id = int(reply_list[0].split("/")[-1])

            a_preview_list = message.xpath(
                ".//a[contains(@class, 'tgme_widget_message_link_preview')]"
            )
            if a_preview_list:
                a_tag = a_preview_list[0]
                preview_url = _first(a_tag.xpath("./@href"))
                image_preview = a_tag.xpath(
                    ".//i[contains(@class, 'link_preview_')]/@style"
                )
                preview_image_url = (
                    extract_bg_img(image_preview[0]) if image_preview else None
                )
                preview_site_name = (
                    extract_text(
                        a_tag.xpath(
                            ".//div[contains(@class, 'link_preview_site_name')]//text()"
                        )
                    )
                    or None
                )
                preview_title = (
                    extract_text(
                        a_tag.xpath(
                            ".//div[contains(@class, 'link_preview_title')]//text()"
                        )
                    )
                    or None
                )
                preview_description = (
                    extract_text(
                        a_tag.xpath(
                            ".//div[contains(@class, 'link_preview_description')]//text()"
                        )
                    )
                    or None
                )

            if text_div is not None:
                # TODO: parse spoilers?
                # TODO: how to know for which text the link is?
                if link_list := text_div.xpath(".//a/@href"):
                    urls.extend(
                        [
                            ("link", urljoin(original_url, url))
                            for url in link_list
                        ]
                    )

            a_fwd_list = message.xpath(
                ".//a[contains(@class, 'tgme_widget_message_forwarded_from_name')]"
            )
            if a_fwd_list:
                forwarded_author = extract_text(
                    a_fwd_list[0].xpath(".//text()")
                )
                forwarded_author_url = a_fwd_list[0].xpath("./@href")[0]

            if type_ is None:
                type_ = "text"

        # NOTE(review): thumbnails are collected for every message here; the
        # recovered source does not show whether this loop was limited to
        # non-service messages - confirm.
        for thumb_type in ("reply", "video", "roundvideo"):
            query = f".//i[contains(@class, 'tgme_widget_message_{thumb_type}_thumb')]/@style"
            urls.extend(
                [
                    (
                        f"thumbnail-{thumb_type}",
                        urljoin(original_url, extract_bg_img(style)),
                    )
                    for style in message.xpath(query)
                ]
            )

        # TODO: parse live location
        # TODO: parse poll
        # TODO: parse document/audio
        # TODO: parse document/other
        yield ChannelMessage(
            id=int(id_),
            created_at=created_at,
            type=type_,
            channel=channel,
            author=author,
            edited=edited,
            text=text,
            views=views,
            urls=urls,
            reply_to_id=reply_to_id,
            preview_url=preview_url,
            preview_image_url=preview_image_url,
            preview_site_name=preview_site_name,
            preview_title=preview_title,
            preview_description=preview_description,
            forwarded_author=forwarded_author,
            forwarded_author_url=forwarded_author_url,
        )


class ChannelScraper:
    """HTTP client that downloads and parses channel preview pages."""

    def __init__(self, user_agent=f"tchan/{__version__}", timeout=30):
        """Create the session.

        Args:
            user_agent: value sent in the ``User-Agent`` header.
            timeout: seconds passed to every request; ``None`` waits forever.
        """
        self.session = requests.Session()
        self.session.headers["User-Agent"] = user_agent
        self.timeout = timeout

    def _get_tree(self, url):
        """Download ``url`` and return its parsed HTML tree."""
        response = self.session.get(url, timeout=self.timeout)
        return document_fromstring(response.text)

    def info(self, username_or_url):
        """Return the :class:`ChannelInfo` of a channel."""
        return parse_info(self._get_tree(normalize_url(username_or_url)))

    def messages(self, username_or_url, *, max_empty_retries=5):
        """Get messages from a channel, paginating until it ends

        Args:
            username_or_url: anything accepted by :func:`normalize_url`.
            max_empty_retries: how many consecutive pages with neither a
                "prev" link nor any message are re-requested before giving up.
        """
        base_url = normalize_url(username_or_url)
        url = base_url

        last_captured_id = None
        empty_retries = 0
        while True:
            tree = self._get_tree(url)
            captured = 0
            for message in parse_messages(url, tree):
                last_captured_id = message.id
                captured += 1
                yield message
            next_page_url = tree.xpath("//link[@rel = 'prev']/@href")
            if next_page_url:
                empty_retries = 0
                url = urljoin(url, next_page_url[0])
                continue
            if last_captured_id is None or last_captured_id <= 20:
                break
            # Telegram did not respond correctly, try again from the last
            # captured message. Bounded: a page that keeps coming back empty
            # would otherwise be requested forever.
            if captured:
                empty_retries = 0
            else:
                empty_retries += 1
                if empty_retries > max_empty_retries:
                    break
            url = f"{base_url}?before={last_captured_id}"


def main():
    """Command-line entry point: scrape channels into a CSV file."""
    import argparse
    import csv
    import json
    import sys

    try:
        from loguru import logger
        from tqdm import tqdm
    except ImportError:
        print("Error - you must install CLI dependencies with:")
        print(" pip install tchan[cli]")
        sys.exit(1)

    parser = argparse.ArgumentParser()
    parser.add_argument("csv_filename")
    parser.add_argument("username_or_url", nargs="+")
    args = parser.parse_args()
    # TODO: add option to limit messages (--max=N, --until=datetime, --after=datetime etc.)
    # TODO: implement `urls_format`: postgres_array, json, multiline
    usernames_or_urls = args.username_or_url
    filename = Path(args.csv_filename)
    filename.parent.mkdir(parents=True, exist_ok=True)

    scraper = ChannelScraper()
    # newline="" is what the csv module requires of the files it writes to.
    with filename.open(mode="w", newline="", encoding="utf-8") as fobj:
        progress = tqdm(unit=" posts", unit_scale=True, dynamic_ncols=True)
        scrape_count, writer = 0, None
        try:
            for username_or_url in usernames_or_urls:
                username = normalize_url(username_or_url).replace(
                    "https://t.me/s/", ""
                )
                progress.desc = f"Scraping {username}"
                channel_messages = 0
                for message in scraper.messages(username):
                    row = asdict(message)
                    row["urls"] = json.dumps(row["urls"])
                    if writer is None:
                        writer = csv.DictWriter(
                            fobj, fieldnames=list(row.keys())
                        )
                        writer.writeheader()
                    writer.writerow(row)
                    channel_messages += 1
                    progress.update()

                if channel_messages == 0:  # Group, bot or invalid username
                    # A generator never leaks StopIteration into a for loop,
                    # so an unusable channel is detected by its empty result.
                    logger.error(
                        "Invalid username or not a public channel: {username}",
                        username=username,
                    )
                    continue
                scrape_count += 1
            progress.desc = (
                f"Scraped {scrape_count} user{'s' if scrape_count > 1 else ''}"
            )
        finally:
            progress.close()


if __name__ == "__main__":
    main()
normalize_url("https://t.me/fulano/12345") == "https://t.me/s/fulano" 20 | assert ( 21 | normalize_url("https://t.me/s/fulano/12345") == "https://t.me/s/fulano" 22 | ) 23 | assert normalize_url("t.me/fulano") == "https://t.me/s/fulano" 24 | assert normalize_url("t.me/s/fulano") == "https://t.me/s/fulano" 25 | assert normalize_url("t.me/s/fulano/12345") == "https://t.me/s/fulano" 26 | assert normalize_url("fulano") == "https://t.me/s/fulano" 27 | assert normalize_url("@fulano") == "https://t.me/s/fulano" 28 | 29 | 30 | def test_channel_info(): 31 | html = """ 32 | [...] 33 | 34 | 35 | 36 | 37 | [...] 38 |
@tchantest
39 | [...] 40 |
1 subscriber
3 photos
2 videos
4 links
41 | [...] 42 | """ 43 | tree = document_fromstring(html) 44 | result = parse_info(tree) 45 | expected = ChannelInfo( 46 | username="tchantest", 47 | title="tchan's test channel 👍", 48 | image_url="https://cdn1.telegram-cdn.org/file/pEJs58u1vQ4-YvOJ-6t1MAIcTPNIusLkfFzACh2CHzG-IOGGZVSKNsNIJhO-bkTdyAIabgzH7RqJBEjPLDWkJT7IYoQeCiDehrk1-KNRuXEgbCHMWDSxMuc9mOp-w3TJkfzLserjAsgwqVKE4fb0NouctjkVJHMcPkwxUVdoiEwEc6cUPP16fYQJfxKELtbBrfPpEha6Bdvfrhy2-6Sn3PPUx_krgiNduHJXXhc8zRcJt-YoOmX_McGV7EqZhEtDZHhRB2r441l4OJQzHjP7L-cA_y6g8cI1_hU7E8oLJJCoEdzHDrR2_z23MzjbHQ4F538BnqPEINvYBGJZP3h6Hg.jpg", 49 | description="Test channel for tchan Python library/CLI", 50 | subscribers=1, 51 | photos=3, 52 | videos=2, 53 | links=4, 54 | ) 55 | assert result == expected 56 | 57 | 58 | def test_parse_service_message_channel_created(): 59 | html = """ 60 |
61 |
62 |
63 | 64 |
65 |
66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 78 |
Channel created 79 |
80 | 85 |
86 |
87 |
88 | """ 89 | tree = document_fromstring(html) 90 | result = list(parse_messages(original_url, tree)) 91 | expected = ChannelMessage( 92 | id=1, 93 | created_at=datetime.datetime( 94 | 2023, 2, 24, 7, 26, 49, tzinfo=datetime.timezone.utc 95 | ), 96 | type="service", 97 | channel="tchantest", 98 | urls=[], 99 | author=None, 100 | edited=False, 101 | text="Channel created", 102 | views=None, 103 | ) 104 | assert result[0] == expected 105 | 106 | 107 | def test_parse_service_message_pinned(): 108 | html = """ 109 |
110 |
111 |
112 |
113 |
114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 126 |
tchan's test channel 👍 pinned «Going to pin this message» 127 |
128 | 133 |
134 |
135 |
136 | """ 137 | tree = document_fromstring(html) 138 | result = list(parse_messages(original_url, tree)) 139 | expected = ChannelMessage( 140 | id=92, 141 | created_at=datetime.datetime( 142 | 2023, 2, 24, 12, 20, 19, tzinfo=datetime.timezone.utc 143 | ), 144 | type="service", 145 | channel="tchantest", 146 | urls=[], 147 | author=None, 148 | edited=False, 149 | text="tchan's test channel👍pinned «Going to pin this message»", 150 | views=None, 151 | ) 152 | assert result[0] == expected 153 | 154 | 155 | def test_parse_multimedia_message(): 156 | html = """ 157 |
158 | 232 |
233 | """ 234 | tree = document_fromstring(html) 235 | result = list(parse_messages(original_url, tree)) 236 | expected = ChannelMessage( 237 | id=84, 238 | created_at=datetime.datetime( 239 | 2023, 2, 24, 11, 1, 46, tzinfo=datetime.timezone.utc 240 | ), 241 | type="multimedia", 242 | channel="tchantest", 243 | author=None, 244 | urls=[ 245 | ( 246 | "photo", 247 | "https://cdn1.telegram-cdn.org/file/CrsaR3dLCwSaTunI7l4nSUu5G7du7049yHXuZwbiOjAfSDEMrJaKiJ9-ly6RJOJf7wSYPGjpUppSBuqLkTbMIl_CMEpS_9nVLvNusCJTRXbhJbU4UFsTxiM89YyDE_9bapVEjoS9vhRS7qw1zSCbV2K42W3TZvvQ8scfiI2xiMIsRkw-YzpIVxbkzpeWz3-US6fi7DswlIobEgCG0uxmHdr6q2FEFOn9BCpfQHlrDqq8rCA9kBteMinAEkALObzktjJ76PMFWQZbQCcKIofW9oOThEJRAdFrRaho9PwBOQIcrSf_2MQmyqg8zA79k04ME76FsNNw3xr7xA160MHckA.jpg", 248 | ), 249 | ( 250 | "photo", 251 | "https://cdn1.telegram-cdn.org/file/QMvda1Z9IET0DA4-jsfevuguTxnTpn6omRRA7gfRfYs6VCmxjsZdeX4R6k4n7rYe7skLclk-A1G7Dw4UuYzV5Ogj9KkYlLSsJD2x_WQDRqmUDauUWEAGa-JNACCFBH2zRJeFwb6OvAD6itpb05MbvyMVvsR18sw8Qe4VcPvODZmfKGxQ7ioslSsxTKoLz8KEAIcvPqVLkJKjkPVERS6u8QBSS-ZPOWu6RvJsS0_fS7oCTSfLjTmn3EfGlU3BvRAmNcVCQv7Jz-CPNBgXHUk5Bru05WNpsdaoCVfX9PzeDycLIoHm5S55H_TV9zocKvg_ZX0NVyLE_Em_wO7nbDwDRQ.jpg", 252 | ), 253 | ( 254 | "video", 255 | "https://cdn1.telegram-cdn.org/file/bd7ceb0b41.mp4?token=FqRn4LGTlVsYQNCiPqd19TF0Z-EK6gWw__1F7os3qthEAdi8kkkrrWeUqJYCrNsv7OF80hlKUHUfKi_9PMsFbg28A8zONLdULTjmt256uaIBQqc3ieY_Js1kg8ezAIi5Y2tab_Z9dycZBrPQjARRoTVwLd9v_QrIQ7il3W1iXipnClgLZK4PxFjFOIhdYmEM_gAhxylo30BdpQFekkk5xCwNNTKmFjeaZTUQ_NNM1k3NGuKQo3SmrHUj4hK1xoKayOZxvB0oJIffTpMkHEBjs_j-tyi72uKmRz7my8ogBY0pHO-2Em5lDh-Li0rDGfHJ_NUssmwGqoToH-el_FZfZWA7juyZs_QNSkWABs0ijiVXDKZPKZKiOVQ0Uh8RtNs4UrNKp91GAVLLCb-C9TpW9SRlgMECG4yKWnXZ7nIsh_nhRNMk-aL0_Lehl0D4bRi1xMkXqtCu5B9pMS7MhIoNbg1sK7_4SJi3YRLugH_jiOGx3HhOP9HtBQZZTBZffb_d3vQQHNfXWEYUwP3t8iBwDrFL0qPwlT_1xMM57S6JlS3ApAEqqYFRxD6YBBFcJr3HMb2rinmL77iwCPtOirSeXFNaG0daSIaUn2jOMoDVH-MwVMBr6Oz_ESOWlqiGtHngUxKxgsbC9UxgvzcXXzHNelvl0z8isMf6oz0QqAntnWA", 256 | ), 257 | ( 258 | "video", 259 | 
"https://cdn1.telegram-cdn.org/file/e996ca12fa.mp4?token=KO0u7GXRUAGbO8QgNdwm-ZedWF5dNzcqm4VeQk_2XikjDahyMnyWKup0S9kyPO4piQqxDlK0yDsFa-myEr1LQPNnJdd1KqYUTiUGWgwI9d-9cA9d-J1U8mtiWDDiaLctgP73nFHrVNbpBopQELyGobP5ha5ofRzEC494a6QHkcKAakFRWlkMu2u2n_HAQZQhurOVvmJBW0pA_yMIv_lrVsHfmvjw-jGwd_dnou5l-158l_0i21I8jzjBMJ4bam9ayHgn3iEjw0uDcmmb-I9-i7Nz1vPQjRJs34_Qjyp6vDrawOGsOY0sAWT7r4lSpefK1Rdc0XQPwQgMY_izO_QfWw", 260 | ), 261 | ( 262 | "thumbnail-video", 263 | "https://cdn1.telegram-cdn.org/file/nWyXAzTHX19MAA8oKaS4tcW-afBU7oOvCpQfVX-Z8l5dVuBeBUDZ5wVn0LRN5OIkwxbDAoSVuJiVm8BPt__9PML4K9rBmVcinjrLUtbNcqhx1WF-JpR_jd5FGQXQStSPYwgYyf460aoQsrVu48r-h4UGyuRLcnNoXytG-siQ_kOxpMArdeZXIQRzNW9eWHfO-IaIx78jqLdhcUaueZjkix91Ak35tPDdloQ7vhzDGqtpJQ1bI6YWVVP0gAm7AQdqcmkoNfFwXjh4uvxEAvsJxdSfMNJ3A1uKhqkm-_fVcbUBgBcJoPIjBBpJx5A3yorSMoMFCvne1QIvFdfjNbGZbg", 264 | ), 265 | ( 266 | "thumbnail-video", 267 | "https://cdn1.telegram-cdn.org/file/jFd1zyUf89Ze7MO2jtuFOuJXVJrYf9-QNmuE_f0x07Nwkx6KgVXhq71eAMiuTYhg1T9lmLA74NfOWgLwLaNk5H4OZYUtQBGrRrGeFaOcnRDcv9jOb23ZAjj6BHMDJ3bfFh_lmsAKQzQIuhJesbi3kBeioa4BeVYW4qjRYUmoKuRYpH6kr2eAalOQ_IYV8p0RxqGbhrJO3vKSYhwodxb3lYI-RKLDUrhSGQUy43hKEt1Epb0rwnpXnXsHcTrbo96O-bqfZ20A_wPqgzMyA13Xa7guojvXlRD2bpd-ezUYdwh9yBb636oqfkvTkaEI8rH4azIJxSB5eVlug5cN2wk97g", 268 | ), 269 | ], 270 | edited=False, 271 | text="Multiple videos and pictures", 272 | views=436_600, 273 | ) 274 | assert result[0] == expected 275 | 276 | 277 | def test_parse_service_message_channel_video_changed(): 278 | html = """ 279 |
280 |
281 |
282 |
283 |
284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 296 |
+ "Channel video changed" 297 |
298 | 303 |
304 |
305 |
306 | """ 307 | tree = document_fromstring(html) 308 | result = list(parse_messages(original_url, tree)) 309 | expected = ChannelMessage( 310 | id=82, 311 | created_at=datetime.datetime( 312 | 2023, 2, 24, 10, 15, 33, tzinfo=datetime.timezone.utc 313 | ), 314 | type="text", 315 | channel="tchantest", 316 | author=None, 317 | urls=[], 318 | edited=False, 319 | text='+ "Channel video changed"', 320 | views=3_100_000, 321 | ) 322 | assert result[0] == expected 323 | 324 | 325 | def test_parse_service_message_channel_name_changed(): 326 | html = """ 327 |
328 |
329 |
330 |
331 |
332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 344 |
Channel name was changed to «tchan's test channel» 345 |
346 | 351 |
352 |
353 |
354 | """ 355 | tree = document_fromstring(html) 356 | result = list(parse_messages(original_url, tree)) 357 | expected = ChannelMessage( 358 | id=2, 359 | created_at=datetime.datetime( 360 | 2023, 2, 24, 7, 28, 1, tzinfo=datetime.timezone.utc 361 | ), 362 | type="service", 363 | channel="tchantest", 364 | author=None, 365 | urls=[], 366 | edited=False, 367 | text="Channel name was changed to «tchan's test channel»", 368 | views=None, 369 | ) 370 | assert result[0] == expected 371 | 372 | 373 | def test_parse_service_message_channel_photo_updated(): 374 | html = """ 375 |
376 |
377 |
378 |
379 |
380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 392 | 393 |
Channel photo updated 394 |
395 | 400 |
401 |
402 |
403 | """ 404 | tree = document_fromstring(html) 405 | result = list(parse_messages(original_url, tree)) 406 | expected = ChannelMessage( 407 | id=3, 408 | created_at=datetime.datetime( 409 | 2023, 2, 24, 7, 29, 23, tzinfo=datetime.timezone.utc 410 | ), 411 | type="service", 412 | channel="tchantest", 413 | author=None, 414 | edited=False, 415 | text="Channel photo updated", 416 | views=None, 417 | urls=[ 418 | ( 419 | "photo", 420 | "https://cdn1.telegram-cdn.org/file/DL-kcK51w4o7tyr5QWQWK7YexgMdwiKIVYbDNbzB2qUtyk9uYfrKo0t19LY08bW4WTdmGpI9t0YQ2aU3RpsaWVk_4Q9QfjBIjaM894tj1r96LzJ8PGXOLkHd3w_KDciIw-AFmZBAKs5UIK6WU6PW1Nx1uh9e084u9rKJQtVu7EZLx1YCgxtx5R69qSKCamUbie0yqbaocYeevtymiMw6C_BeYwLZux6iMhoejvs6jyaQXiQLtm53xvAcqPKefzM0frCmDU1t5sllrHJD7L2iv52m9j27Kcyi-cu6detpDOwxdC2Be9CvsN4UXDOwvxiEl3TQSkrFKb06csVd85lEaQ.jpg", 421 | ), 422 | ], 423 | ) 424 | assert result[0] == expected 425 | 426 | 427 | def test_parse_text_message_multiline(): 428 | html = """ 429 |
430 |
431 |
432 |
433 |
434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 446 |
Bigger
text
message
number
34 447 |
448 | 453 |
454 |
455 |
456 | """ 457 | tree = document_fromstring(html) 458 | result = list(parse_messages(original_url, tree)) 459 | expected = ChannelMessage( 460 | id=62, 461 | created_at=datetime.datetime( 462 | 2023, 2, 24, 8, 1, 26, tzinfo=datetime.timezone.utc 463 | ), 464 | type="text", 465 | channel="tchantest", 466 | urls=[], 467 | author=None, 468 | edited=False, 469 | text="Bigger\ntext\nmessage\nnumber\n34", 470 | views=2, 471 | ) 472 | assert result[0] == expected 473 | 474 | 475 | def test_parse_forwarded_text_message(): 476 | html = """ 477 |
478 |
479 |
480 |
481 |
482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 494 |
Forwarded from Some user 495 |
496 |
;) 497 |
498 | 503 |
504 |
505 |
506 | """ 507 | tree = document_fromstring(html) 508 | result = list(parse_messages(original_url, tree)) 509 | expected = ChannelMessage( 510 | id=89, 511 | created_at=datetime.datetime( 512 | 2023, 2, 24, 12, 13, 31, tzinfo=datetime.timezone.utc 513 | ), 514 | type="text", 515 | channel="tchantest", 516 | urls=[], 517 | author=None, 518 | edited=False, 519 | text=";)", 520 | views=2, 521 | forwarded_author="Some user", 522 | forwarded_author_url="https://t.me/some_user", 523 | ) 524 | assert result[0] == expected 525 | 526 | 527 | def test_parse_text_message_signed_not_edited(): 528 | html = """ 529 |
530 |
531 |
532 |
533 |
534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 546 |
Signed and not edited 547 |
548 | 553 |
554 |
555 |
556 | """ 557 | tree = document_fromstring(html) 558 | result = list(parse_messages(original_url, tree)) 559 | expected = ChannelMessage( 560 | id=79, 561 | created_at=datetime.datetime( 562 | 2023, 2, 24, 8, 7, 51, tzinfo=datetime.timezone.utc 563 | ), 564 | type="text", 565 | channel="tchantest", 566 | urls=[], 567 | author="Álvaro Justen", 568 | edited=False, 569 | text="Signed and not edited", 570 | views=2, 571 | ) 572 | assert result[0] == expected 573 | 574 | 575 | def test_parse_text_message_link_no_preview(): 576 | html = """ 577 |
578 |
579 |
580 |
581 |
582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 594 |
This is a message with a link

https://brasil.io/ 595 |
596 | 601 |
602 |
603 |
604 | """ 605 | tree = document_fromstring(html) 606 | result = list(parse_messages(original_url, tree)) 607 | expected = ChannelMessage( 608 | id=24, 609 | created_at=datetime.datetime( 610 | 2023, 2, 24, 7, 48, 2, tzinfo=datetime.timezone.utc 611 | ), 612 | type="text", 613 | channel="tchantest", 614 | urls=[("link", "https://brasil.io/")], 615 | author=None, 616 | edited=True, 617 | text="This is a message with a link\nhttps://brasil.io/", 618 | views=2, 619 | ) 620 | assert result[0] == expected 621 | 622 | 623 | def test_parse_text_message_link_with_regular_preview(): 624 | html = """ 625 |
626 | 660 |
661 | """ 662 | tree = document_fromstring(html) 663 | result = list(parse_messages(original_url, tree)) 664 | expected = ChannelMessage( 665 | id=94, 666 | created_at=datetime.datetime( 667 | 2023, 2, 24, 13, 47, 38, tzinfo=datetime.timezone.utc 668 | ), 669 | type="text", 670 | channel="tchantest", 671 | urls=[ 672 | ( 673 | "link", 674 | "https://agenciabrasil.ebc.com.br/justica/noticia/2022-08/operacao-guardioes-do-bioma-apreende-239-toneladas-de-minerio", 675 | ) 676 | ], 677 | author=None, 678 | edited=False, 679 | text="https://agenciabrasil.ebc.com.br/justica/noticia/2022-08/operacao-guardioes-do-bioma-apreende-239-toneladas-de-minerio", 680 | views=1, 681 | preview_url="https://agenciabrasil.ebc.com.br/justica/noticia/2022-08/operacao-guardioes-do-bioma-apreende-239-toneladas-de-minerio", 682 | preview_image_url="https://cdn4.telegram-cdn.org/file/C2HB_eIeUYDV3yo1xut1cz6d06v6ITJW1fnuT7uihR5nIzUG9unDbq_cCbJag2TFdg7C_uJReq5lTcu9HZHI88el4u17YROLNW-rm4nLJCGc9d7L8Pfvkf4wLEx7pfY32k68VOXqg3XQ3Y0M1HgiEZyz9IIY9WvqImvvgwWG5f_czeIe8cC8h_X7JAkwUnoNsPlPf6qzqfV5QBswqQQF0PoRzYxd3L-uLAAreSvahHIFTnhLWQZCNQXxucYd9-Ct-w6voFkGtBkpF68Tx5i6QdTFbWp6WqR4LR7BeNTdTgeMGoZN1x46I_maRaeeHyqDRqD9cJLl08hMH6NLtMKbkw.jpg", 683 | preview_site_name="Agência Brasil", 684 | preview_title="Operação Guardiões do Bioma apreende 23,9 toneladas de minério", 685 | preview_description="Ação contra o garimpo ilegal em Terra Indígena Yanomami durou um mês e resultou na prisão de 25 pessoas, apreensão de aeronaves e munições e em 115 autos de infração.", 686 | ) 687 | assert result[0] == expected 688 | 689 | 690 | def test_parse_text_message_link_with_right_preview(): 691 | html = """ 692 |
693 |
694 |
695 |
696 |
697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 709 |
link with preview https://python.org/ 710 |
711 | 712 | 713 | 715 | 717 | 719 | 720 | 725 |
726 |
727 |
728 | """ 729 | tree = document_fromstring(html) 730 | result = list(parse_messages(original_url, tree)) 731 | expected = ChannelMessage( 732 | id=81, 733 | created_at=datetime.datetime( 734 | 2023, 2, 24, 8, 16, 18, tzinfo=datetime.timezone.utc 735 | ), 736 | type="text", 737 | channel="tchantest", 738 | urls=[("link", "https://python.org/")], 739 | author=None, 740 | edited=False, 741 | text="link with preview\nhttps://python.org/", 742 | views=2, 743 | preview_url="https://www.python.org/", 744 | preview_image_url="https://cdn4.telegram-cdn.org/file/mcgzW-avL5x2aBHAMw_8xb-MEiP5rCavDScU8vCkIIiYDgc202XMtQ4daRGRZVGU8uIHWwOyWa-Io-NeHkdrbj87eaHQCMgH6t6T4cVrW5GUwQDuFgQpE7-7XFXWc2I_ffYrhqgZUqHfdJNIMovjz7H1i-Gk45e-rlFKlpb1bUOaOd07ISTdr1OCUSAbs7z6oofThWpyE_2AxA5upuupuiocaeMINNxnwnJ_ate8S3gvnGMq81trLqLtcrUI9Dlo1Na4QemQPH7IOz-ra6DhlyiHm6fb_Q0pDOpLvmpI73jODW3H7QfBjp5htgN7dNMtxkGxw11-tCmVU6gRTIU-5w.jpg", 745 | preview_site_name="Python.org", 746 | preview_title="Welcome to Python.org", 747 | preview_description="The official home of the Python Programming Language", 748 | ) 749 | assert result[0] == expected 750 | 751 | 752 | def test_parse_unsigned_text_message(): 753 | html = """ 754 |
755 |
756 |
757 |
758 |
759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | 768 | 769 | 770 | 772 |
Unsigned message 773 |
774 | 779 |
780 |
781 |
782 | """ 783 | tree = document_fromstring(html) 784 | result = list(parse_messages(original_url, tree)) 785 | expected = ChannelMessage( 786 | id=6, 787 | created_at=datetime.datetime( 788 | 2023, 2, 24, 7, 30, 39, tzinfo=datetime.timezone.utc 789 | ), 790 | type="text", 791 | channel="tchantest", 792 | urls=[], 793 | author=None, 794 | edited=False, 795 | text="Unsigned message", 796 | views=2, 797 | ) 798 | assert result[0] == expected 799 | 800 | 801 | def test_parse_emoji_message(): 802 | html = """ 803 |
804 |
805 |
806 |
807 |
808 | 809 | 810 | 811 | 812 | 813 | 814 | 815 | 816 | 817 | 818 | 820 |
👍 821 |
822 | 827 |
828 |
829 |
830 | """ 831 | tree = document_fromstring(html) 832 | result = list(parse_messages(original_url, tree)) 833 | expected = ChannelMessage( 834 | id=7, 835 | created_at=datetime.datetime( 836 | 2023, 2, 24, 7, 30, 53, tzinfo=datetime.timezone.utc 837 | ), 838 | type="text", 839 | channel="tchantest", 840 | author=None, 841 | edited=False, 842 | urls=[("photo", "https://telegram.org/img/emoji/40/F09F918D.png")], 843 | text="👍", 844 | views=2, 845 | ) 846 | assert result[0] == expected 847 | 848 | 849 | def test_parse_sticker_message(): 850 | html = """ 851 |
852 |
853 |
854 |
855 |
856 | 857 | 858 | 859 | 860 | 861 | 862 | 863 | 864 | 865 | 866 | 868 |
869 |
870 |
This media is not supported in your browser 871 |
872 | VIEW IN TELEGRAM 873 |
874 |
875 |
876 |
877 |
878 |
879 | 884 |
885 |
886 |
887 | """ 888 | tree = document_fromstring(html) 889 | result = list(parse_messages(original_url, tree)) 890 | expected = ChannelMessage( 891 | id=9, 892 | created_at=datetime.datetime( 893 | 2023, 2, 24, 7, 31, 6, tzinfo=datetime.timezone.utc 894 | ), 895 | type="sticker", 896 | channel="tchantest", 897 | author=None, 898 | edited=False, 899 | urls=[ 900 | ( 901 | "photo", 902 | "https://cdn1.telegram-cdn.org/file/5b5c6e1325.webp?token=chQuhI8SVanorZnNJ_PTvHtJR1UOPC_cIjPNCVXhhG40BqJ9cpBCgrQy0NazQTCWO7bG_6JyNI4mFboxXSTcZJvATVgKwRTEkzFzeVen9a5AaZV36NUk9AWXUWFOaAX6jY4fKMQ3Sq6hicdTU4OjX4SvrwX501-pRHzw7b-dXMPwymHqMNwE-eVpiFu827y32eSOulEDWMvg2LMpmsIpks0b7fXcO-V-JvGwDvMsjVRy82406A5zElMjdD6lgBXZy5Hg79AyyVMJOprENkM0DY0evphw3gmq5G7YreJ9EcWIPX7K9skVfukCFxCxqjlHxV6T4aFGwxlZEJywBw_7pg", 903 | ), 904 | ], 905 | text=None, 906 | views=2, 907 | ) 908 | assert result[0] == expected 909 | 910 | 911 | def test_parse_audio_document(): 912 | html = """ 913 |
914 |
915 |
916 |
917 |
918 | 919 | 920 | 921 | 922 | 923 | 924 | 925 | 926 | 927 | 928 | 930 | 931 |
932 |
933 |
934 |
AUD-20130329-WA0000 935 |
936 |
portadosfundos 937 |
938 |
939 |
940 |
Povo hebreu 941 |
942 | 947 |
948 |
949 |
950 | """ 951 | tree = document_fromstring(html) 952 | result = list(parse_messages(original_url, tree)) 953 | expected = ChannelMessage( 954 | id=12, 955 | created_at=datetime.datetime( 956 | 2023, 2, 24, 7, 33, 58, tzinfo=datetime.timezone.utc 957 | ), 958 | type="document", 959 | channel="tchantest", 960 | author=None, 961 | edited=False, 962 | urls=[], 963 | text="Povo hebreu", 964 | views=2, 965 | ) 966 | assert result[0] == expected 967 | 968 | 969 | def test_parse_text_reply_to_video(): 970 | html = """ 971 | 972 |
973 |
974 |
975 |
976 |
977 | 978 | 979 | 980 | 981 | 982 | 983 | 984 | 985 | 986 | 987 | 989 | 990 |
991 | tchan's test channel 👍 992 |
993 |
Video 994 |
995 |
996 |
Reply to a video (not recorded in telegram) 997 |
998 | 1003 |
1004 |
1005 |
1006 | """ 1007 | tree = document_fromstring(html) 1008 | result = list(parse_messages(original_url, tree)) 1009 | expected = ChannelMessage( 1010 | id=20, 1011 | created_at=datetime.datetime( 1012 | 2023, 2, 24, 7, 38, 31, tzinfo=datetime.timezone.utc 1013 | ), 1014 | type="text", 1015 | channel="tchantest", 1016 | author=None, 1017 | edited=False, 1018 | urls=[ 1019 | ( 1020 | "thumbnail-reply", 1021 | "https://cdn1.telegram-cdn.org/file/nWyXAzTHX19MAA8oKaS4tcW-afBU7oOvCpQfVX-Z8l5dVuBeBUDZ5wVn0LRN5OIkwxbDAoSVuJiVm8BPt__9PML4K9rBmVcinjrLUtbNcqhx1WF-JpR_jd5FGQXQStSPYwgYyf460aoQsrVu48r-h4UGyuRLcnNoXytG-siQ_kOxpMArdeZXIQRzNW9eWHfO-IaIx78jqLdhcUaueZjkix91Ak35tPDdloQ7vhzDGqtpJQ1bI6YWVVP0gAm7AQdqcmkoNfFwXjh4uvxEAvsJxdSfMNJ3A1uKhqkm-_fVcbUBgBcJoPIjBBpJx5A3yorSMoMFCvne1QIvFdfjNbGZbg", 1022 | ), 1023 | ], 1024 | text="Reply to a video (not recorded in telegram)", 1025 | views=2, 1026 | reply_to_id=19, 1027 | ) 1028 | assert result[0] == expected 1029 | 1030 | 1031 | def test_parse_poll(): 1032 | html = """ 1033 |
1034 |
1035 |
1036 |
1037 |
1038 | 1039 | 1040 | 1041 | 1042 | 1043 | 1044 | 1045 | 1046 | 1047 | 1048 | 1050 | 1084 | 1089 |
1090 |
1091 |
1092 | """ 1093 | tree = document_fromstring(html) 1094 | result = list(parse_messages(original_url, tree)) 1095 | expected = ChannelMessage( 1096 | id=11, 1097 | created_at=datetime.datetime( 1098 | 2023, 2, 24, 7, 32, 23, tzinfo=datetime.timezone.utc 1099 | ), 1100 | type="poll", 1101 | channel="tchantest", 1102 | author=None, 1103 | edited=False, 1104 | urls=[], 1105 | text=None, 1106 | views=2, 1107 | ) 1108 | assert result[0] == expected 1109 | 1110 | 1111 | def test_parse_photo_single(): 1112 | html = """ 1113 |
1114 |
1115 |
1116 |
1117 |
1118 | 1119 | 1120 | 1121 | 1122 | 1123 | 1124 | 1125 | 1126 | 1127 | 1128 | 1130 | 1131 |
1132 |
1133 |
1134 |
Picture by telegram camera 1135 |
1136 | 1141 |
1142 |
1143 |
1144 | """ 1145 | tree = document_fromstring(html) 1146 | result = list(parse_messages(original_url, tree)) 1147 | expected = ChannelMessage( 1148 | id=16, 1149 | created_at=datetime.datetime( 1150 | 2023, 2, 24, 7, 35, 28, tzinfo=datetime.timezone.utc 1151 | ), 1152 | type="photo", 1153 | channel="tchantest", 1154 | author=None, 1155 | edited=False, 1156 | urls=[ 1157 | ( 1158 | "photo", 1159 | "https://cdn1.telegram-cdn.org/file/AqX5QJmpXiolNyLq3Aq8-4eqTHJtpueMZqcszNrmWGgUt4I4iaH-CPxmqR-QPdgdzVtE_rX8cpCOgeAOsN9Ais72d79W-56VIEOdCenSvm5YuK9nHh-faVhkQAnTnw3DtOobB6G3jbZRDvHdjhlxyojAwvNXWGSyfIzmaEPq9C_ut2VXo5gJk8ZOAUi5OfxIRf7UVSyNyXbfKXZPwDdok7uTLp1gTaLacKNdcHN0Wdo0NCj2phCO_Hhv5zOqvJkC_Ct-3d3vv_aa03gOWTXzGtq1sbrlynldi9zVtWn6TOsNBGzDGSWqcm7WUe1QbddpBeS7VQWqyih2fKmhKLr5wA.jpg", 1160 | ), 1161 | ], 1162 | text="Picture by telegram camera", 1163 | views=2, 1164 | ) 1165 | assert result[0] == expected 1166 | 1167 | 1168 | def test_parse_message_weird_preview(): 1169 | html = """ 1170 |
1171 |
1172 |
1173 |
1174 |
1175 | 1176 | 1177 | 1178 | 1179 | 1180 | 1181 | 1182 | 1183 | 1184 | 1185 | 1187 |
_

✍️ CÂMARA APROVA
_


🔸 Projeto que aumenta penas para crimes sexuais contra crianças
#Punição_ampliada
🗳 Opine sobre esta proposta clicando aqui!
____

🔸Projeto que facilita corte ou poda de árvore quando houver risco de acidente.
#Poda_de_árvores
🗳 Opine sobre esta proposta clicando aqui!
____

🔸Proposta que susta resoluções da Aneel sobre tarifas de transmissão.
#Tarifas_de_transmissão
🗳 Opine sobre esta proposta clicando aqui!


____

📬 Convide seus contatos do WhatsApp a conhecerem nosso canal clicando aqui!
____ 1188 |
1189 | 1190 | 1192 | 1194 | 1195 | 1200 |
1201 |
1202 |
1203 | """ 1204 | tree = document_fromstring(html) 1205 | result = list(parse_messages("https://t.me/s/CamaradosDeputados", tree)) 1206 | expected = ChannelMessage( 1207 | id=7334, 1208 | created_at=datetime.datetime( 1209 | 2022, 11, 9, 22, 43, 54, tzinfo=datetime.timezone.utc 1210 | ), 1211 | type="text", 1212 | channel="CamaradosDeputados", 1213 | author=None, 1214 | edited=True, 1215 | urls=[ 1216 | ("photo", "https://telegram.org/img/emoji/40/E29C8D.png"), 1217 | ("link", "http://bit.ly/3Tq1C4G"), 1218 | ( 1219 | "link", 1220 | "https://t.me/s/CamaradosDeputados?q=%23Puni%C3%A7%C3%A3o_ampliada", 1221 | ), 1222 | ("link", "http://bit.ly/3PCX1vn"), 1223 | ("link", "http://bit.ly/3tkdScB"), 1224 | ( 1225 | "link", 1226 | "https://t.me/s/CamaradosDeputados?q=%23Poda_de_%C3%A1rvores", 1227 | ), 1228 | ("link", "http://bit.ly/3A1c4c4"), 1229 | ("link", "http://bit.ly/3A3EoKS"), 1230 | ( 1231 | "link", 1232 | "https://t.me/s/CamaradosDeputados?q=%23Tarifas_de_transmiss%C3%A3o", 1233 | ), 1234 | ("link", "http://bit.ly/3zWxOpw"), 1235 | ("link", "https://bit.ly/2UBmLNC"), 1236 | ], 1237 | text="_\n✍️\nCÂMARA APROVA\n_\n🔸\nProjeto que aumenta penas para crimes sexuais contra crianças\n#Punição_ampliada\n🗳\nOpine sobre esta proposta\nclicando aqui!\n____\n🔸\nProjeto que facilita corte ou poda de árvore quando houver risco de acidente.\n#Poda_de_árvores\n🗳\nOpine sobre esta proposta\nclicando aqui!\n____\n🔸\nProposta que susta resoluções da Aneel sobre tarifas de transmissão\n.\n#Tarifas_de_transmissão\n🗳\nOpine sobre esta proposta\nclicando aqui!\n____\n📬\nConvide seus contatos do WhatsApp a conhecerem nosso canal\nclicando aqui!\n____", 1238 | views=1200, 1239 | preview_url="http://bit.ly/3Tq1C4G", 1240 | preview_image_url=None, 1241 | preview_site_name="Portal da Câmara dos Deputados", 1242 | preview_title="Câmara aprova projeto que aumenta penas para crimes sexuais contra crianças; acompanhe - Notícias", 1243 | preview_description=None, 1244 | ) 1245 | assert 
result[0] == expected 1246 | 1247 | 1248 | def test_parse_video_big(): 1249 | html = """ 1250 |
1251 |
1252 |
1253 |
1254 |
1255 | 1256 | 1257 | 1258 | 1259 | 1260 | 1261 | 1262 | 1263 | 1264 | 1265 |
Blah 1266 |
1267 | 1268 |
1269 |
1270 |
1271 |
1272 | 1273 |
1274 |
1275 |
Media is too big 1276 |
1277 | VIEW IN TELEGRAM 1278 |
1279 |
1280 |
some text
1281 | 1286 |
1287 |
1288 |
1289 | """ 1290 | tree = document_fromstring(html) 1291 | result = list(parse_messages(original_url, tree)) 1292 | expected = ChannelMessage( 1293 | id=5343, 1294 | created_at=datetime.datetime( 1295 | 2023, 1, 30, 23, 30, 40, tzinfo=datetime.timezone.utc 1296 | ), 1297 | type="video", 1298 | channel="some_random_user", 1299 | author=None, 1300 | edited=False, 1301 | urls=[ 1302 | ( 1303 | "thumbnail-video", 1304 | "https://cdn1.telegram-cdn.org/file/HyfobRUDoAxInyfbDVXk1q13pW97NtZN0TYuraxfsJLPps5R14DRXfT1DN2qZkAg4UIpu4RE0frU5LJLK3Y9oWyDgw3Y-Jg92EghLSe0Wmb6dMqCFwoz2CISl_hAeNgDksSQ5i_feURS-NzwXJRHeHwqz1funqsyNkdC5irHPSglylhsf3ZdEZMA4b1XqYrU3Zz4IgT3pLz0HcPSHcxHSyRAGjmb6vFbUYr-qThcvTD7HGXdb6gVRhV3bQyzo8xS9d2Vxho2j735Rxnr5qaprcef883AEuYuDSD31Anok_s6cvOz0jurCG8jbtonkSN9omsub5oyLrvT0H2ZQ4Gx3g", 1305 | ), 1306 | ], 1307 | text="some text", 1308 | views=436600, 1309 | ) 1310 | assert result[0] == expected 1311 | 1312 | 1313 | def test_parse_video_single(): 1314 | html = """ 1315 |
1316 |
1317 |
1318 |
1319 |
1320 | 1321 | 1322 | 1323 | 1324 | 1325 | 1326 | 1327 | 1328 | 1329 | 1330 | 1332 | 1333 | 1334 |
1335 | 1336 |
1337 |
1338 |
1339 | 1340 |
1341 |
1342 |
This media is not supported in your browser 1343 |
1344 | VIEW IN TELEGRAM 1345 |
1346 |
1347 |
Video by telegram camera 1348 |
1349 | 1354 |
1355 |
1356 |
1357 | """ 1358 | tree = document_fromstring(html) 1359 | result = list(parse_messages(original_url, tree)) 1360 | expected = ChannelMessage( 1361 | id=18, 1362 | created_at=datetime.datetime( 1363 | 2023, 2, 24, 7, 36, 49, tzinfo=datetime.timezone.utc 1364 | ), 1365 | type="video", 1366 | channel="tchantest", 1367 | author=None, 1368 | edited=False, 1369 | urls=[ 1370 | ( 1371 | "video", 1372 | "https://cdn1.telegram-cdn.org/file/61dc623809.mp4?token=K5rf-tYZuK-hwjpm2Aa8NLNZI54LnZSlUftpmFaAjXdqeUtO0xfykZx8zfupPs7TYNYHMTDKJcmfoDs66q2bTsAk0d1kwrwZrArAGYOWBnJmTDualrn2gIQ8_VPKEQccf_k_sGyzGcbsCs9rkdlOTWMUQXOxoKDQ12X2hdnPONoehz0KAR_nH-W0lGKGUuxbUhu8yFVZGuu9JLFSp5dLprKNy9HYAJeh8D7yM3lZ7GmIG3ck5GLdjeYzRFx96sn0NviOl8H7kHcoDOCv_CkppMpXWninC8yboJUsmPISjfwQOiDD9ZOE2wZb7TN0zYSAHDFEy0Ga1msxs6f1j7Wjyw", 1373 | ), 1374 | ( 1375 | "thumbnail-video", 1376 | "https://cdn1.telegram-cdn.org/file/WiZBMqAIrM0sjmV7tKxjJQ3BcyP04k0bU3i1_kCI_cipeYNV2EgXyUIZ8Jii-7BMgdpGH9HKFCY8NuzsnvpYToh3PJdSdd4aOwSrYvu8uGQgogharMY-8IjAdcFNQh0stJ7r1-3mjJCT-1SojXo1LOBCt-sX7PI9woHjvqPFDDJyv9-xNbEgwWYMuKUyCA6Z1aKsEAz0wIDzLil4IXze3_neSQRkSlqWUnCGV_JoPy-qQuJd_6do_AnJMaLwlDcFBYAzZ-sX1kJC03qWhtjHUy9uaG9j8z23C_RcSlhWPTYPQi5t0x1HQXRWm-kaASMrghIIg9HFSu4MdCLOb8IgIA", 1377 | ), 1378 | ], 1379 | text="Video by telegram camera", 1380 | views=2, 1381 | ) 1382 | assert result[0] == expected 1383 | 1384 | 1385 | def test_parse_video_single_2(): 1386 | html = """ 1387 |
1388 |
1389 |
1390 |
1391 |
1392 | 1393 | 1394 | 1395 | 1396 | 1397 | 1398 | 1399 | 1400 | 1401 | 1402 | 1404 | 1405 |
1406 | 1407 |
1408 |
1409 |
1410 | 1411 |
1412 |
1413 |
This media is not supported in your browser 1414 |
1415 | VIEW IN TELEGRAM 1416 |
1417 |
1418 | 1423 |
1424 |
1425 |
1426 | """ 1427 | tree = document_fromstring(html) 1428 | result = list(parse_messages(original_url, tree)) 1429 | expected = ChannelMessage( 1430 | id=19, 1431 | created_at=datetime.datetime( 1432 | 2023, 2, 24, 7, 38, 13, tzinfo=datetime.timezone.utc 1433 | ), 1434 | type="video", 1435 | channel="tchantest", 1436 | author=None, 1437 | edited=False, 1438 | urls=[ 1439 | ( 1440 | "video", 1441 | "https://cdn1.telegram-cdn.org/file/bd7ceb0b41.mp4?token=FqRn4LGTlVsYQNCiPqd19TF0Z-EK6gWw__1F7os3qthEAdi8kkkrrWeUqJYCrNsv7OF80hlKUHUfKi_9PMsFbg28A8zONLdULTjmt256uaIBQqc3ieY_Js1kg8ezAIi5Y2tab_Z9dycZBrPQjARRoTVwLd9v_QrIQ7il3W1iXipnClgLZK4PxFjFOIhdYmEM_gAhxylo30BdpQFekkk5xCwNNTKmFjeaZTUQ_NNM1k3NGuKQo3SmrHUj4hK1xoKayOZxvB0oJIffTpMkHEBjs_j-tyi72uKmRz7my8ogBY0pHO-2Em5lDh-Li0rDGfHJ_NUssmwGqoToH-el_FZfZWA7juyZs_QNSkWABs0ijiVXDKZPKZKiOVQ0Uh8RtNs4UrNKp91GAVLLCb-C9TpW9SRlgMECG4yKWnXZ7nIsh_nhRNMk-aL0_Lehl0D4bRi1xMkXqtCu5B9pMS7MhIoNbg1sK7_4SJi3YRLugH_jiOGx3HhOP9HtBQZZTBZffb_d3vQQHNfXWEYUwP3t8iBwDrFL0qPwlT_1xMM57S6JlS3ApAEqqYFRxD6YBBFcJr3HMb2rinmL77iwCPtOirSeXFNaG0daSIaUn2jOMoDVH-MwVMBr6Oz_ESOWlqiGtHngUxKxgsbC9UxgvzcXXzHNelvl0z8isMf6oz0QqAntnWA", 1442 | ), 1443 | ( 1444 | "thumbnail-video", 1445 | "https://cdn1.telegram-cdn.org/file/nWyXAzTHX19MAA8oKaS4tcW-afBU7oOvCpQfVX-Z8l5dVuBeBUDZ5wVn0LRN5OIkwxbDAoSVuJiVm8BPt__9PML4K9rBmVcinjrLUtbNcqhx1WF-JpR_jd5FGQXQStSPYwgYyf460aoQsrVu48r-h4UGyuRLcnNoXytG-siQ_kOxpMArdeZXIQRzNW9eWHfO-IaIx78jqLdhcUaueZjkix91Ak35tPDdloQ7vhzDGqtpJQ1bI6YWVVP0gAm7AQdqcmkoNfFwXjh4uvxEAvsJxdSfMNJ3A1uKhqkm-_fVcbUBgBcJoPIjBBpJx5A3yorSMoMFCvne1QIvFdfjNbGZbg", 1446 | ), 1447 | ], 1448 | text=None, 1449 | views=2, 1450 | ) 1451 | assert result[0] == expected 1452 | 1453 | 1454 | def test_parse_round_video_single(): 1455 | html = """ 1456 |
1457 |
1458 |
1459 |
1460 |
1461 | 1462 | 1463 | 1464 | 1465 | 1466 | 1467 | 1468 | 1469 | 1470 | 1471 | 1473 |
1474 |
1475 | 1476 | 1477 |
1478 |
1479 |
1480 |
1481 |
1482 |
1483 |
1484 |
This media is not supported in your browser 1485 |
1486 | VIEW IN TELEGRAM 1487 |
1488 |
1489 | 1490 | 1491 | 1492 |
1493 |
1494 | 1495 |
1496 | 1501 |
1502 |
1503 |
1504 | """ 1505 | tree = document_fromstring(html) 1506 | result = list(parse_messages(original_url, tree)) 1507 | expected = ChannelMessage( 1508 | id=17, 1509 | created_at=datetime.datetime( 1510 | 2023, 2, 24, 7, 35, 45, tzinfo=datetime.timezone.utc 1511 | ), 1512 | type="round-video", 1513 | channel="tchantest", 1514 | author=None, 1515 | edited=False, 1516 | urls=[ 1517 | ( 1518 | "round-video", 1519 | "https://cdn1.telegram-cdn.org/file/68145c80a1.mp4?token=Asgm3ihYDp5WkmfxRXtMximu8cGDQy4Y2UIeZ3JmSNpRz-oDSsXbORc8H4V4oaR4LkaIEFfPz82hrRD1Fvu2wNDljkcCVbxwa0D5iD_sfhWKTDK3Hfy1Fu1hQPZG0b4-1rEkNIfRJp8T_H0nW6Rej80Nl8I8xZAINfHAe0ibS6Qs5R4IudWG4ULL3NRmJLzGDM92YlqVlkEzwvLB9au1G0jczynEbJ7qn5npIxQYPPIBzZehLtZHAXqp_cnq6moGUL6mAfK1tOPqN13wVfizhtv2XHy8WqvaEVxZMTWFyMu64nPc1aB1EGsfvdgSdpV-pENMcsH5ihLEbwfCTbQPFw", 1520 | ), 1521 | ( 1522 | "thumbnail-roundvideo", 1523 | "https://cdn1.telegram-cdn.org/file/U9Z-cXN1H4Y4JdCGNLPCaw8Y_4idAGdFDGqOMSS20fsGBN0OWrzYo-rtvGIgex8IkoAGGjqz2DduF369l5kY_jGB1zd7NpE9rqgSnB1hhUVFmxWqhP3bzsyTjJRABWJns176vCa_Jn8CEjHFddu39ONkG4Hyqc7wGDA6eOOMFF2pATNxePd-Jg056jTn79In9byN7cKk5Rlkt2hkY02vlaee_eokRInwNRQShvJ59Xdv0gEQV27DpSbjFvc4Ci_66unLiu3aWWA50etg5CbB2nmtSfLSs7ujGBteh5Aw9z0HjB4BVw3eNbZVrDKBUTe_5eFw6XE4RKwRnJ8LL4TPuA", 1524 | ), 1525 | ], 1526 | text=None, 1527 | views=2, 1528 | ) 1529 | assert result[0] == expected 1530 | 1531 | 1532 | def test_parse_photo_multiple(): 1533 | html = """ 1534 |
1535 |
1536 |
1537 |
1538 |
1539 | 1540 | 1541 | 1542 | 1543 | 1544 | 1545 | 1546 | 1547 | 1548 | 1549 | 1551 | 1568 |
1569 |
Multiple pics 1570 |
1571 |
1572 | 1577 |
1578 |
1579 |
1580 | """ 1581 | tree = document_fromstring(html) 1582 | result = list(parse_messages(original_url, tree)) 1583 | expected = ChannelMessage( 1584 | id=14, 1585 | created_at=datetime.datetime( 1586 | 2023, 2, 24, 7, 34, 20, tzinfo=datetime.timezone.utc 1587 | ), 1588 | type="photo", 1589 | channel="tchantest", 1590 | author=None, 1591 | edited=False, 1592 | urls=[ 1593 | ( 1594 | "photo", 1595 | "https://cdn1.telegram-cdn.org/file/QMvda1Z9IET0DA4-jsfevuguTxnTpn6omRRA7gfRfYs6VCmxjsZdeX4R6k4n7rYe7skLclk-A1G7Dw4UuYzV5Ogj9KkYlLSsJD2x_WQDRqmUDauUWEAGa-JNACCFBH2zRJeFwb6OvAD6itpb05MbvyMVvsR18sw8Qe4VcPvODZmfKGxQ7ioslSsxTKoLz8KEAIcvPqVLkJKjkPVERS6u8QBSS-ZPOWu6RvJsS0_fS7oCTSfLjTmn3EfGlU3BvRAmNcVCQv7Jz-CPNBgXHUk5Bru05WNpsdaoCVfX9PzeDycLIoHm5S55H_TV9zocKvg_ZX0NVyLE_Em_wO7nbDwDRQ.jpg", 1596 | ), 1597 | ( 1598 | "photo", 1599 | "https://cdn1.telegram-cdn.org/file/CrsaR3dLCwSaTunI7l4nSUu5G7du7049yHXuZwbiOjAfSDEMrJaKiJ9-ly6RJOJf7wSYPGjpUppSBuqLkTbMIl_CMEpS_9nVLvNusCJTRXbhJbU4UFsTxiM89YyDE_9bapVEjoS9vhRS7qw1zSCbV2K42W3TZvvQ8scfiI2xiMIsRkw-YzpIVxbkzpeWz3-US6fi7DswlIobEgCG0uxmHdr6q2FEFOn9BCpfQHlrDqq8rCA9kBteMinAEkALObzktjJ76PMFWQZbQCcKIofW9oOThEJRAdFrRaho9PwBOQIcrSf_2MQmyqg8zA79k04ME76FsNNw3xr7xA160MHckA.jpg", 1600 | ), 1601 | ], 1602 | text="Multiple pics", 1603 | views=2, 1604 | ) 1605 | assert result[0] == expected 1606 | 1607 | 1608 | def test_parse_location_message(): 1609 | html = """ 1610 |
1611 |
1612 |
1613 |
1614 |
1615 | 1616 | 1617 | 1618 | 1619 | 1620 | 1621 | 1622 | 1623 | 1624 | 1625 | 1626 | 1628 | 1629 |
1630 |
1631 |
1632 | 1637 |
1638 |
1639 |
1640 | """ 1641 | tree = document_fromstring(html) 1642 | result = list(parse_messages(original_url, tree)) 1643 | expected = ChannelMessage( 1644 | id=10, 1645 | created_at=datetime.datetime( 1646 | 2023, 2, 24, 7, 31, 38, tzinfo=datetime.timezone.utc 1647 | ), 1648 | type="location", 1649 | channel="tchantest", 1650 | author=None, 1651 | edited=False, 1652 | urls=[ 1653 | ( 1654 | "link", 1655 | "https://maps.google.com/maps?q=-23.930587176407,-44.086305367818&ll=-23.930587176407,-44.086305367818&z=16", 1656 | ), 1657 | ], 1658 | text=None, 1659 | views=2, 1660 | ) 1661 | assert result[0] == expected 1662 | 1663 | 1664 | def test_parse_location_message_2(): 1665 | html = """ 1666 |
1667 |
1668 |
1669 |
1670 |
1671 | 1672 | 1673 | 1674 | 1675 | 1676 | 1677 | 1678 | 1679 | 1680 | 1681 | 1683 | 1684 |
1685 |
1686 |
1687 | 1692 |
1693 |
1694 |
1695 | """ 1696 | tree = document_fromstring(html) 1697 | result = list(parse_messages(original_url, tree)) 1698 | expected = ChannelMessage( 1699 | id=83, 1700 | created_at=datetime.datetime( 1701 | 2023, 2, 24, 11, 1, 22, tzinfo=datetime.timezone.utc 1702 | ), 1703 | type="location", 1704 | channel="tchantest", 1705 | author=None, 1706 | edited=False, 1707 | urls=[ 1708 | ( 1709 | "link", 1710 | "https://maps.google.com/maps?q=-23.532363086523,-46.689620092745&ll=-23.532363086523,-46.689620092745&z=16", 1711 | ), 1712 | ], 1713 | text=None, 1714 | views=1, 1715 | ) 1716 | assert result[0] == expected 1717 | 1718 | 1719 | def test_parse_location_message_3(): 1720 | html = """ 1721 |
1722 |
1723 |
1724 |
1725 |
1726 | 1727 | 1728 | 1729 | 1730 | 1731 | 1732 | 1733 | 1734 | 1735 | 1736 | 1738 | 1739 |
1740 |
1741 |
1742 | 1747 |
1748 |
1749 |
1750 | """ 1751 | tree = document_fromstring(html) 1752 | result = list(parse_messages(original_url, tree)) 1753 | expected = ChannelMessage( 1754 | id=88, 1755 | created_at=datetime.datetime( 1756 | 2023, 2, 24, 11, 23, 40, tzinfo=datetime.timezone.utc 1757 | ), 1758 | type="location", 1759 | channel="tchantest", 1760 | author=None, 1761 | edited=False, 1762 | urls=[ 1763 | ( 1764 | "link", 1765 | "https://maps.google.com/maps?q=-23.531972140424,-46.689076113848&ll=-23.531972140424,-46.689076113848&z=16", 1766 | ), 1767 | ], 1768 | text=None, 1769 | views=2, 1770 | ) 1771 | assert result[0] == expected 1772 | 1773 | 1774 | def test_parse_audio_message(): 1775 | html = """ 1776 |
1777 |
1778 |
1779 |
1780 |
1781 | 1782 | 1783 | 1784 | 1785 | 1786 | 1787 | 1788 | 1789 | 1790 | 1791 | 1793 | 1794 | 1795 |
1796 |
1797 |
1798 |
1799 |
1800 |
1801 |
1802 |
1803 |
1804 |
1805 |
1806 |
1807 |
1808 | 1809 |
1810 |
1811 | 1816 |
1817 |
1818 |
1819 | """ 1820 | tree = document_fromstring(html) 1821 | result = list(parse_messages(original_url, tree)) 1822 | expected = ChannelMessage( 1823 | id=13, 1824 | created_at=datetime.datetime( 1825 | 2023, 2, 24, 7, 34, 5, tzinfo=datetime.timezone.utc 1826 | ), 1827 | type="audio", 1828 | channel="tchantest", 1829 | author=None, 1830 | edited=False, 1831 | urls=[ 1832 | ( 1833 | "audio", 1834 | "https://cdn1.telegram-cdn.org/file/29881a3f30.ogg?token=RPh7-yUn9te932hRlzltMAmWTgmmAzD_PswWMJnmCZSfQQ63SfT5jfT_IpSq71gXP0d5F_G3fZLc1mxLuPR_NFSBVncAk7hvT5086hIQJfHX8qoE8VW-714sWoGTqVv6l35yS7V8_hCkt4KD2kW3F_5K2MPJ6yKPHBB4VzvlLmrwUoVVTPPOl2NJypUtDdN2fwFj7TkeZYHu0jrtKGrynmmBOp36SHpks7c9bkkL8HcaHhlzZXCBzibbWh0IM895baESbOimQrxnUwuTE9gv_VHGa_EMiwBD71p_NyHYPfpgNxGi1TPCVzJSO5QVPXDBZ3MSHoYI4COLZMt_4wwx1A", 1835 | ), 1836 | ], 1837 | text=None, 1838 | views=2, 1839 | ) 1840 | assert result[0] == expected 1841 | 1842 | 1843 | def test_parse_signed_edited_text_message(): 1844 | html = """ 1845 |
1846 |
1847 |
1848 |
1849 |
1850 | 1851 | 1852 | 1853 | 1854 | 1855 | 1856 | 1857 | 1858 | 1859 | 1860 | 1862 |
Hello! Signed and edited message 1863 |
1864 | 1869 |
1870 |
1871 |
1872 | """ 1873 | tree = document_fromstring(html) 1874 | result = list(parse_messages(original_url, tree)) 1875 | expected = ChannelMessage( 1876 | id=5, 1877 | created_at=datetime.datetime( 1878 | 2023, 2, 24, 7, 29, 57, tzinfo=datetime.timezone.utc 1879 | ), 1880 | type="text", 1881 | channel="tchantest", 1882 | urls=[], 1883 | author="Álvaro Justen", 1884 | edited=True, 1885 | text="Hello! Signed and edited message", 1886 | views=2, 1887 | ) 1888 | assert result[0] == expected 1889 | 1890 | 1891 | def test_parse_no_posts_found(): 1892 | html = """ 1893 |
1894 |
1895 |
1896 |
No posts found
1897 |
1898 |
1899 |
1900 | """ 1901 | tree = document_fromstring(html) 1902 | result = list(parse_messages(original_url, tree)) 1903 | expected = [] 1904 | assert result == expected 1905 | --------------------------------------------------------------------------------