├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── setup.cfg
├── setup.py
├── tchan.py
└── test_tchan.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 | *~
3 | .*.sw?
4 | .DS_Store
5 | .directory
6 | .sass-cache
7 | .vscode/
8 | data/*
9 | docker/data
10 | collected-static/
11 |
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 | *$py.class
17 |
18 | # C extensions
19 | *.so
20 |
21 | # Distribution / packaging
22 | .Python
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | wheels/
35 | share/python-wheels/
36 | *.egg-info/
37 | .installed.cfg
38 | *.egg
39 | MANIFEST
40 |
41 | # PyInstaller
42 | # Usually these files are written by a python script from a template
43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
44 | *.manifest
45 | *.spec
46 |
47 | # Installer logs
48 | pip-log.txt
49 | pip-delete-this-directory.txt
50 |
51 | # Unit test / coverage reports
52 | htmlcov/
53 | .tox/
54 | .nox/
55 | .coverage
56 | .coverage.*
57 | .cache
58 | nosetests.xml
59 | coverage.xml
60 | *.cover
61 | *.py,cover
62 | .hypothesis/
63 | .pytest_cache/
64 | cover/
65 |
66 | # Translations
67 | *.mo
68 | *.pot
69 |
70 | # Django stuff:
71 | *.log
72 | local_settings.py
73 | db.sqlite3
74 | db.sqlite3-journal
75 |
76 | # Flask stuff:
77 | instance/
78 | .webassets-cache
79 |
80 | # Scrapy stuff:
81 | .scrapy
82 |
83 | # Sphinx documentation
84 | docs/_build/
85 |
86 | # PyBuilder
87 | .pybuilder/
88 | target/
89 |
90 | # Jupyter Notebook
91 | .ipynb_checkpoints
92 |
93 | # IPython
94 | profile_default/
95 | ipython_config.py
96 |
97 | # pyenv
98 | # For a library or package, you might want to ignore these files since the code is
99 | # intended to run in multiple environments; otherwise, check them in:
100 | # .python-version
101 |
102 | # pipenv
103 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
104 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
105 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
106 | # install all needed dependencies.
107 | #Pipfile.lock
108 |
109 | # poetry
110 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
111 | # This is especially recommended for binary packages to ensure reproducibility, and is more
112 | # commonly ignored for libraries.
113 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
114 | #poetry.lock
115 |
116 | # pdm
117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
118 | #pdm.lock
119 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
120 | # in version control.
121 | # https://pdm.fming.dev/#use-with-ide
122 | .pdm.toml
123 |
124 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
125 | __pypackages__/
126 |
127 | # Celery stuff
128 | celerybeat-schedule
129 | celerybeat.pid
130 |
131 | # SageMath parsed files
132 | *.sage.py
133 |
134 | # Environments
135 | .env
136 | .venv
137 |
138 | # Spyder project settings
139 | .spyderproject
140 | .spyproject
141 |
142 | # Rope project settings
143 | .ropeproject
144 |
145 | # mkdocs documentation
146 | /site
147 |
148 | # mypy
149 | .mypy_cache/
150 | .dmypy.json
151 | dmypy.json
152 |
153 | # Pyre type checker
154 | .pyre/
155 |
156 | # pytype static type analyzer
157 | .pytype/
158 |
159 | # Cython debug symbols
160 | cython_debug/
161 |
162 | # PyCharm
163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
165 | # and can be added to the global gitignore or merged into this file. For a more nuclear
166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
167 | .idea/
168 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | lint:
2 | autoflake --in-place --recursive --remove-unused-variables --remove-all-unused-imports .
3 | isort --skip migrations --skip wsgi --skip asgi --line-length 80 --multi-line VERTICAL_HANGING_INDENT --trailing-comma .
4 | black --exclude '(docker/|migrations/|config/settings\.py|manage\.py|\.direnv|\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|venv|\.svn|\.ipynb_checkpoints|_build|buck-out|build|dist|__pypackages__)' -l 80 .
5 | flake8 --config setup.cfg
6 |
7 | test:
8 | pytest
9 |
10 | test-v:
11 | pytest -vvv
12 |
13 | test-release:
14 | rm -rf build dist
15 | python setup.py sdist bdist_wheel
16 | twine check dist/*
17 | twine upload --repository-url https://test.pypi.org/legacy/ dist/*
18 |
19 | release:
20 | rm -rf build dist
21 | python setup.py sdist bdist_wheel
22 | twine check dist/*
23 | twine upload dist/*
24 |
25 | .PHONY: lint test test-v test-release release
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tchan - Telegram Channel scraper
2 |
3 | Python library and command-line interface to scrape Telegram public channels.
4 | Since this scraper uses Telegram Channel Web preview, **it won't work** for:
5 |
6 | - Public channels with "Restrict saving content" option enabled
7 | - Private channels
8 | - Public Groups
9 | - Private Groups
10 |
11 | It's also not possible to retrieve comments, since they're made on a group.
12 |
13 | ## Installing
14 |
15 |
16 | ```shell
17 | pip install tchan # Python library only
18 | pip install tchan[cli] # Library + CLI
19 | ```
20 |
21 | ## Using as a library
22 |
23 | ```python
24 | from tchan import ChannelScraper
25 |
26 | scraper = ChannelScraper()
27 | for message in scraper.messages("tchantest"):
28 | print(f"New message ({message.type}) from {message.channel}:")
29 | print(f" id={message.id}")
30 | print(f" created_at={message.created_at.isoformat()}")
31 | print(f" text={message.text}")
32 | # TODO: add more parameters
33 | ```
34 |
35 | ## Using as a command-line tool
36 |
37 | Scrape one or many channels and save all messages to `messages.csv`:
38 |
39 | ```shell
40 | tchan messages.csv channel1 [channel2 ... channelN]
41 | ```
42 |
43 | ## Tests
44 |
45 | To run all tests, execute:
46 |
47 | ```shell
48 | make test # or just `pytest`
49 | ```
50 |
51 | Make sure to install development requirements.
52 |
53 | Tests were made on a channel created for this task:
54 | [tchantest](https://t.me/tchantest).
55 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = tchan
3 | version = 0.1.4
4 | description = Scrape Telegram public channels (Python library + CLI)
5 | long_description = file: README.md
6 | long_description_content_type = text/markdown
7 | url = https://github.com/PythonicCafe/tchan/
8 | keywords = telegram scraping social-media
9 | author = Álvaro Justen
10 | author_email = alvarojusten@gmail.com
11 | license = GNU Lesser General Public License v3 (LGPLv3)
12 | classifiers =
13 | Intended Audience :: Developers
14 | License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)
15 | Operating System :: OS Independent
16 | Programming Language :: Python
17 | Programming Language :: Python :: 3
18 | Programming Language :: Python :: 3 :: Only
19 | Programming Language :: Python :: 3.7
20 | Programming Language :: Python :: 3.8
21 | Programming Language :: Python :: 3.9
22 | Programming Language :: Python :: 3.10
23 | Programming Language :: Python :: 3.11
24 | Topic :: Internet :: WWW/HTTP
25 | Topic :: Internet :: WWW/HTTP :: Dynamic Content
26 |
27 | [options]
28 | include_package_data = true
29 | py_modules = tchan
30 | python_requires = >=3.7
31 | install_requires =
32 | lxml
33 | requests
34 |
35 | [options.extras_require]
36 | cli =
37 | loguru
38 | tqdm
39 | dev =
40 | autoflake
41 | black
42 | flake8
43 | ipython
44 | isort
45 | pytest
46 | twine
47 | wheel
48 |
49 | [options.packages.find]
50 | exclude =
51 | data*
52 | test*
53 | Makefile
54 |
55 | [options.entry_points]
56 | console_scripts =
57 | tchan = tchan:main
58 |
59 | [flake8]
60 | max-line-length = 80
61 | exclude = .tox,.git,*/migrations/*,*/static/CACHE/*,docs,node_modules,docker/data/*
62 | ignore=I001,I003,I004,E231,E501,W503
63 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup()  # called without arguments; presumably setuptools picks up the metadata/options from the setup.cfg next to this file
4 |
--------------------------------------------------------------------------------
/tchan.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import re
3 | from dataclasses import asdict, dataclass
4 | from pathlib import Path
5 | from typing import List
6 | from urllib.parse import urljoin, urlparse
7 |
8 | import requests
9 | from lxml.html import document_fromstring
10 |
11 |
12 | __version__ = "0.1.4"
# Captures the URL inside a CSS `background-image:url('...')` declaration.
# The group is non-greedy so that a later `')` in the same style string (for
# instance a second `url('...')`) is not swallowed into the captured URL.
REGEXP_BACKGROUND_IMAGE_URL = re.compile(r"background-image:url\('(.*?)'\)")


def extract_bg_img(style):
    """Return the background image URL found in a CSS `style` string.

    Protocol-relative URLs (``//host/path``) are returned with an explicit
    ``https:`` scheme; any other URL is returned as captured.

    Raises:
        IndexError: if `style` has no ``background-image:url('...')`` part.
    """
    url = REGEXP_BACKGROUND_IMAGE_URL.findall(style)[0]
    if url.startswith("//"):
        url = f"https:{url}"
    return url
21 |
def convert_int(value):
    """Convert a counter string such as "123", "1.2K" or "3M" to an int.

    A trailing "K" or "M" (either case) multiplies the number by one thousand
    or one million. Surrounding whitespace is ignored. Any fractional
    remainder left after scaling is truncated.

    Raises:
        ValueError: if `value` is not a number with an optional K/M suffix.
    """
    # Local import: Decimal keeps the scaling exact, whereas
    # int(float("64.1") * 1000) truncates binary rounding error to 64099.
    from decimal import Decimal, InvalidOperation

    multipliers = {"K": 1_000, "M": 1_000_000}
    text = value.strip()
    multiplier = multipliers.get(text[-1:].upper())
    if multiplier is None:
        return int(text)
    try:
        return int(Decimal(text[:-1]) * multiplier)
    except InvalidOperation:
        # Keep the exception type the float()-based version raised.
        raise ValueError(f"invalid counter value: {value!r}") from None
29 |
30 |
31 | @dataclass
32 | class ChannelMessage:  # one scraped channel message; instances are built by parse_messages()
33 | id: int  # message id: second part of the `data-post` attribute, converted with int()
34 | created_at: datetime.datetime  # parsed with fromisoformat() from the message's <time datetime="..."> attribute
35 | type: str  # one of: service, sticker, location, audio, document, poll, photo, round-video, video, multimedia, text
36 | channel: str  # first part of the `data-post` attribute
37 | edited: bool  # True when the message meta text contains "edited"
38 | urls: List[str]  # NOTE(review): parse_messages() appends (kind, url) tuples here, so List[str] looks inaccurate
39 | author: str = None  # text of the "from author" span; None when absent
40 | text: str = None  # text extracted from the message text div; None when there is none
41 | views: int = None  # view counter converted with convert_int(); stays None for service messages
42 | reply_to_id: int = None  # last path segment of the reply link, as int
43 | preview_url: str = None  # href of the link preview anchor
44 | preview_image_url: str = None  # background image of the link preview
45 | preview_site_name: str = None  # link preview site name; None when empty
46 | preview_title: str = None  # link preview title; None when empty
47 | preview_description: str = None  # link preview description; None when empty
48 | forwarded_author: str = None  # text of the "forwarded from" anchor
49 | forwarded_author_url: str = None  # href of the "forwarded from" anchor
50 |
51 |
52 | @dataclass
53 | class ChannelInfo:  # channel metadata; instances are built by parse_info()
54 | username: str  # channel username without the leading "@"
55 | title: str  # content of the og:title meta tag (None when the tag is missing)
56 | image_url: str  # content of the og:image meta tag (None when the tag is missing)
57 | description: str = None  # content of the og:description meta tag
58 | subscribers: int = None  # value of the matching info counter; None when the counter is absent
59 | photos: int = None  # value of the matching info counter; None when the counter is absent
60 | videos: int = None  # value of the matching info counter; None when the counter is absent
61 | links: int = None  # value of the matching info counter; None when the counter is absent
62 |
63 |
def normalize_url(username_or_url):
    """Normalize a username or URL to the channel's canonical preview URL.

    Accepts a bare username ("name" or "@name"), a full URL, or a scheme-less
    "t.me/..." / "telegram.me/..." address, with or without the "/s/" preview
    prefix and with or without a trailing message id. Surrounding whitespace
    is ignored. Always returns "https://t.me/s/<name>".
    """
    path = urlparse(username_or_url.strip()).path
    # Without a scheme, urlparse() leaves the host inside `path`; drop it but
    # keep the "/" that follows so the checks below see a regular URL path.
    for host in ("t.me", "telegram.me"):
        if path.startswith(f"{host}/"):
            path = path[len(host):]
            break
    if path.startswith("/s/"):
        path = path[2:]
    if path.startswith("/"):
        path = path[1:]
    if path.startswith("@"):
        path = path[1:]
    # Only the first path segment is the channel name; the rest (for example
    # a message id) is discarded.
    return urljoin("https://t.me/s/", path.split("/")[0])
76 |
77 |
def extract_text(parts, delimiter="\n"):
    """Join the non-blank items of `parts`, each stripped, with `delimiter`.

    The joined result is stripped once more before being returned.
    """
    cleaned = []
    for piece in parts:
        stripped = piece.strip()
        if stripped:
            cleaned.append(stripped)
    return delimiter.join(cleaned).strip()
82 |
83 |
def parse_info(tree):
    """Build a `ChannelInfo` from the parsed HTML tree of a channel page.

    Username comes from the channel info header; title, image and description
    come from the page's Open Graph meta tags; the numeric fields come from
    the channel info counters. Anything missing from the page is left empty
    (username) or None (all other fields) instead of raising.
    """

    def first_or_none(query):
        found = tree.xpath(query)
        return found[0] if found else None

    username = extract_text(
        tree.xpath(
            "//div[@class = 'tgme_channel_info_header_username']//text()"
        ),
        delimiter="",
    )
    # startswith() rather than username[0]: the header may be missing, which
    # leaves `username` empty.
    if username.startswith("@"):
        username = username[1:]

    counters = {}
    counters_divs = tree.xpath("//div[@class = 'tgme_channel_info_counters']")
    counter_divs = (
        counters_divs[0].xpath(".//div[@class = 'tgme_channel_info_counter']")
        if counters_divs
        else []
    )
    for counter_div in counter_divs:
        key_text = counter_div.xpath(".//span[@class = 'counter_type']/text()")
        value_text = counter_div.xpath(
            ".//span[@class = 'counter_value']/text()"
        )
        if not key_text or not value_text:
            continue
        # The label appears to follow the count ("1 subscriber" but
        # "3 photos"), so index every counter by its singular form to match
        # both spellings.
        key = key_text[0].strip().lower()
        if key.endswith("s"):
            key = key[:-1]
        counters[key] = convert_int(value_text[0])

    return ChannelInfo(
        username=username,
        title=first_or_none("//meta[@property = 'og:title']/@content"),
        image_url=first_or_none("//meta[@property = 'og:image']/@content"),
        description=first_or_none(
            "//meta[@property = 'og:description']/@content"
        ),
        subscribers=counters.get("subscriber"),
        photos=counters.get("photo"),
        videos=counters.get("video"),
        links=counters.get("link"),
    )
122 |
123 |
124 | def parse_messages(original_url, tree):  # generator: yields one ChannelMessage per message wrapper found in `tree`; `original_url` is the base used to resolve relative URLs
125 | "Retrieve messages from HTML tree"
126 | messages = tree.xpath("//div[contains(@class, 'tgme_widget_message_wrap')]")
127 | for message in reversed(messages):  # wrappers are visited in reverse document order
128 | if message.xpath(".//div[contains(@class, 'tme_no_messages_found')]"):
129 | # XXX: this case may happen because a great number of requests was
130 | # made and Telegram sent this response as if there were no new
131 | # posts when actually there are.
132 | return
133 | channel, id_ = message.xpath(".//div/@data-post")[0].split("/")  # data-post is split on "/" into (channel, id)
134 | created_at = datetime.datetime.fromisoformat(
135 | message.xpath(".//time/@datetime")[0]
136 | )
137 | edited_text = message.xpath(
138 | ".//span[@class = 'tgme_widget_message_meta']/text()"
139 | )
140 | edited = "edited" in edited_text[0].strip() if edited_text else False  # only the first meta text node is inspected
141 | author_text = message.xpath(
142 | ".//span[@class = 'tgme_widget_message_from_author']/text()"
143 | )
144 | author = author_text[0] if author_text else None
145 | text, views, type_, reply_to_id, urls = None, None, None, None, []  # defaults; `urls` collects (kind, url) tuples
146 | forwarded_author, forwarded_author_url = None, None
147 | (
148 | preview_url,
149 | preview_image_url,
150 | preview_site_name,
151 | preview_title,
152 | preview_description,
153 | ) = (None, None, None, None, None)
154 | text_div_list = message.xpath(
155 | ".//div[contains(@class, 'tgme_widget_message_text')]"
156 | )
157 | text_div = text_div_list[0] if text_div_list else None  # None when the message has no text div
158 | if message.xpath(".//div[contains(@class, 'service_message')]"):
159 | text = extract_text(text_div.xpath(".//text()"), delimiter="")  # NOTE(review): text_div is not checked for None here; a service message without a text div would raise AttributeError - confirm
160 | type_ = "service"
161 | image_url_text = message.xpath(
162 | ".//a[@class = 'tgme_widget_message_service_photo']/img/@src"
163 | )
164 | if image_url_text:
165 | urls.append(("photo", urljoin(original_url, image_url_text[0])))
166 |
167 | else:  # NOTE(review): indentation is lost in this dump, so how far this branch extends and how the sections below nest cannot be confirmed from here
168 | views_text = extract_text(
169 | message.xpath(
170 | ".//span[contains(@class, 'tgme_widget_message_views')]//text()"
171 | ),
172 | delimiter="",
173 | )
174 | if views_text:
175 | views = convert_int(views_text)  # convert_int() expands the "K"/"M" suffixes
176 | if text_div is not None:
177 | text = extract_text(text_div.xpath(".//text()"), delimiter="\n")
178 | emoji_style_text = text_div.xpath(
179 | ".//i[@class = 'emoji']/@style"
180 | )
181 | if emoji_style_text:  # only the first emoji style is used; its background image is recorded as a "photo" URL
182 | urls.append(
183 | (
184 | "photo",
185 | urljoin(
186 | original_url,
187 | extract_bg_img(emoji_style_text[0]),
188 | ),
189 | )
190 | )
191 | else:
192 | sticker_div_list = message.xpath(
193 | ".//div[contains(@class, 'tgme_widget_message_sticker_wrap')]//i[contains(@class, 'tgme_widget_message_sticker')]/@data-webp"
194 | )
195 | if sticker_div_list:
196 | # TODO: add option to get sticker data from:
197 | # message.xpath(".//i[contains(@class, 'tgme_widget_message_sticker')]/@style")[0]
198 | type_ = "sticker"
199 | urls.append(
200 | ("photo", urljoin(original_url, sticker_div_list[0]))
201 | )
202 |
203 | location_a_list = message.xpath(
204 | ".//a[@class = 'tgme_widget_message_location_wrap']/@href"
205 | )
206 | if location_a_list:
207 | type_ = "location"
208 | urls.append(
209 | ("link", urljoin(original_url, location_a_list[0]))
210 | )
211 |
212 | audio_src_list = message.xpath(".//audio/@src")
213 | if audio_src_list:
214 | # TODO: add duration to dataclass?
215 | # duration = extract_text(
216 | # message.xpath(
217 | # ".//time[contains(@class, 'tgme_widget_message_voice_duration')]/text()"
218 | # )[0],
219 | # delimiter="",
220 | # )
221 | type_ = "audio"
222 | urls.append(
223 | ("audio", urljoin(original_url, audio_src_list[0]))
224 | )
225 |
226 | document_class_list = message.xpath(
227 | ".//div[contains(@class, 'tgme_widget_message_document')]/@class"
228 | )
229 | if document_class_list:
230 | # TODO: get title, document type and other info
231 | type_ = "document"  # only the type is recorded; no URL is appended for documents
232 |
233 | poll_div_list = message.xpath(
234 | ".//div[contains(@class, 'tgme_widget_message_poll')]"
235 | )
236 | if poll_div_list:
237 | # TODO: get other info
238 | type_ = "poll"  # only the type is recorded for polls
239 |
240 | photos_div_list = message.xpath(
241 | ".//a[contains(@class, 'tgme_widget_message_photo_wrap')]/@style"
242 | )
243 | if photos_div_list:
244 | urls.extend(
245 | [
246 | ("photo", urljoin(original_url, extract_bg_img(style)))
247 | for style in photos_div_list
248 | ]
249 | )
250 | type_ = "photo" if type_ is None else "multimedia"  # "multimedia" when an earlier check already set a type
251 |
252 | roundvideos_div_list = message.xpath(
253 | ".//video[contains(@class, 'tgme_widget_message_roundvideo')]/@src"
254 | )
255 | if roundvideos_div_list:
256 | # TODO: get video duration?
257 | urls.extend(
258 | [
259 | ("round-video", urljoin(original_url, url))
260 | for url in roundvideos_div_list
261 | ]
262 | )
263 | type_ = "round-video" if type_ is None else "multimedia"
264 |
265 | video_link_list = message.xpath(
266 | "//a[contains(@class, 'tgme_widget_message_video_player')]"  # NOTE(review): starts with "//" while the other queries use ".//", so this searches the whole document rather than this message - likely a bug
267 | )
268 | if video_link_list:
269 | type_ = "video" if type_ is None else "multimedia"
270 | videos_div_list = message.xpath(
271 | ".//div[contains(@class, 'tgme_widget_message_video_wrap')]//video[contains(@class, 'tgme_widget_message_video')]/@src"
272 | )
273 | if videos_div_list:
274 | # TODO: get video duration?
275 | urls.extend(
276 | [
277 | ("video", urljoin(original_url, url))
278 | for url in videos_div_list
279 | ]
280 | )
281 |
282 | reply_list = message.xpath(
283 | ".//a[contains(@class, 'tgme_widget_message_reply')]/@href"
284 | )
285 | if reply_list:
286 | reply_to_id = int(reply_list[0].split("/")[-1])  # last path segment of the reply link is the replied-to message id
287 |
288 | a_preview_list = message.xpath(
289 | ".//a[contains(@class, 'tgme_widget_message_link_preview')]"
290 | )
291 | if a_preview_list:  # only the first link preview anchor is used
292 | a_tag = a_preview_list[0]
293 | url_preview = a_tag.xpath("./@href")
294 | preview_url = url_preview[0] if url_preview else None
295 | image_preview = a_tag.xpath(
296 | ".//i[contains(@class, 'link_preview_')]/@style"
297 | )
298 | preview_image_url = (
299 | extract_bg_img(image_preview[0]) if image_preview else None
300 | )
301 | preview_site_name = (
302 | extract_text(
303 | a_tag.xpath(
304 | ".//div[contains(@class, 'link_preview_site_name')]//text()"
305 | )
306 | )
307 | or None  # empty extracted text is stored as None
308 | )
309 | preview_title = (
310 | extract_text(
311 | a_tag.xpath(
312 | ".//div[contains(@class, 'link_preview_title')]//text()"
313 | )
314 | )
315 | or None
316 | )
317 | preview_description = (
318 | extract_text(
319 | a_tag.xpath(
320 | ".//div[contains(@class, 'link_preview_description')]//text()"
321 | )
322 | )
323 | or None
324 | )
325 |
326 | if text_div is not None:
327 | # TODO: parse spoilers?
328 | # TODO: how to know for which text the link is?
329 | if link_list := text_div.xpath(".//a/@href"):  # NOTE(review): the walrus operator needs Python 3.8+, but setup.cfg declares python_requires >=3.7
330 | urls.extend(
331 | [
332 | ("link", urljoin(original_url, url))
333 | for url in link_list
334 | ]
335 | )
336 |
337 | a_fwd_list = message.xpath(
338 | ".//a[contains(@class, 'tgme_widget_message_forwarded_from_name')]"
339 | )
340 | if a_fwd_list:
341 | forwarded_author = extract_text(
342 | a_fwd_list[0].xpath(".//text()")
343 | )
344 | forwarded_author_url = a_fwd_list[0].xpath("./@href")[0]
345 |
346 | if type_ is None:
347 | type_ = "text"  # fallback when none of the checks above set a type
348 |
349 | for thumb_type in ("reply", "video", "roundvideo"):  # thumbnails are appended as ("thumbnail-<type>", url) tuples
350 | query = f".//i[contains(@class, 'tgme_widget_message_{thumb_type}_thumb')]/@style"
351 | urls.extend(
352 | [
353 | (
354 | f"thumbnail-{thumb_type}",
355 | urljoin(original_url, extract_bg_img(style)),
356 | )
357 | for style in message.xpath(query)
358 | ]
359 | )
360 |
361 | # TODO: parse live location
362 | # TODO: parse poll
363 | # TODO: parse document/audio
364 | # TODO: parse document/other
365 | yield ChannelMessage(
366 | id=int(id_),
367 | created_at=created_at,
368 | type=type_,
369 | channel=channel,
370 | author=author,
371 | edited=edited,
372 | text=text,
373 | views=views,
374 | urls=urls,
375 | reply_to_id=reply_to_id,
376 | preview_url=preview_url,
377 | preview_image_url=preview_image_url,
378 | preview_site_name=preview_site_name,
379 | preview_title=preview_title,
380 | preview_description=preview_description,
381 | forwarded_author=forwarded_author,
382 | forwarded_author_url=forwarded_author_url,
383 | )
384 |
385 |
class ChannelScraper:
    """Scrape a channel through its web preview pages using one HTTP session."""

    def __init__(self, user_agent=f"tchan/{__version__}"):
        """Create the session; `user_agent` is sent with every request."""
        self.session = requests.Session()
        self.session.headers["User-Agent"] = user_agent

    def info(self, username_or_url):
        """Download the channel preview page and return its `ChannelInfo`."""
        url = normalize_url(username_or_url)
        response = self.session.get(url)
        tree = document_fromstring(response.text)
        return parse_info(tree)

    def messages(self, username_or_url, max_retries=3):
        """Yield the channel's messages, following pagination until it ends.

        Pages are followed through the `<link rel="prev">` element. When a
        page has no such link but the last message seen has an id above 20,
        the response is assumed to be incomplete and the page before that
        message is requested explicitly.

        `max_retries` bounds how many consecutive such requests may come back
        without any message. Without this bound a channel whose oldest
        available message has an id above 20 would be re-requested forever.
        """
        base_url = normalize_url(username_or_url)
        url = base_url
        last_captured_id = None
        empty_retries = 0
        while True:
            response = self.session.get(url)
            tree = document_fromstring(response.text)
            captured_any = False
            for message in parse_messages(url, tree):
                last_captured_id = message.id
                captured_any = True
                yield message
            if captured_any:
                # Progress was made, so earlier failed attempts don't count.
                empty_retries = 0

            next_page_url = tree.xpath("//link[@rel = 'prev']/@href")
            if next_page_url:
                url = urljoin(url, next_page_url[0])
                continue

            if last_captured_id is None or last_captured_id <= 20:
                break
            if empty_retries >= max_retries:
                break
            # Telegram did not respond correctly, try again
            empty_retries += 1
            url = f"{base_url}?before={last_captured_id}"
419 |
def main():
    """Command-line entry point: scrape public channels into a single CSV file.

    Positional command-line arguments:
        csv_filename: path of the CSV file to (over)write; missing parent
            directories are created.
        username_or_url: one or more channel usernames or URLs, each passed
            through ``normalize_url`` before scraping.

    One CSV row is written per scraped message; the ``urls`` field is stored
    as a JSON string. The header row is derived from the first message
    scraped, so no header is written when nothing is scraped.

    Exits with status 1 when the optional CLI dependencies (``loguru`` and
    ``tqdm``) are not installed.
    """
    import argparse
    import csv
    import json
    import sys
    from pathlib import Path

    try:
        from loguru import logger
        from tqdm import tqdm
    except ImportError:
        print("Error - you must install CLI dependencies with:")
        print(" pip install tchan[cli]")
        # `exit()` is injected by the `site` module and may be absent
        # (e.g. `python -S`); `sys.exit()` is always available.
        sys.exit(1)

    parser = argparse.ArgumentParser()
    parser.add_argument("csv_filename")
    parser.add_argument("username_or_url", nargs="+")
    args = parser.parse_args()
    # TODO: add option to limit messages (--max=N, --until=datetime, --after=datetime etc.)
    # TODO: implement `urls_format`: postgres_array, json, multiline
    usernames_or_urls = args.username_or_url
    filename = Path(args.csv_filename)
    # `exist_ok=True` avoids the race between an exists() check and mkdir().
    filename.parent.mkdir(parents=True, exist_ok=True)

    scraper = ChannelScraper()
    # The csv module requires `newline=""` so that it controls line endings
    # itself; an explicit UTF-8 encoding keeps emoji/accented text writable
    # regardless of the platform's default encoding.
    with filename.open(mode="w", newline="", encoding="utf-8") as fobj:
        progress = tqdm(unit=" posts", unit_scale=True, dynamic_ncols=True)
        scrape_count, writer = 0, None
        try:
            for username_or_url in usernames_or_urls:
                username = normalize_url(username_or_url).replace(
                    "https://t.me/s/", ""
                )
                progress.desc = f"Scraping {username}"
                try:
                    for message in scraper.messages(username):
                        message = asdict(message)
                        message["urls"] = json.dumps(message["urls"])
                        if writer is None:
                            # Lazily create the writer: field names come
                            # from the first message actually scraped.
                            writer = csv.DictWriter(
                                fobj, fieldnames=list(message.keys())
                            )
                            writer.writeheader()
                        writer.writerow(message)
                        progress.update()

                # NOTE(review): if `scraper.messages` is a generator function,
                # PEP 479 turns a StopIteration raised inside it into
                # RuntimeError - confirm this handler can still fire.
                except StopIteration:  # Group, bot or invalid username
                    # Pass the value as a format argument so the placeholder
                    # is actually interpolated by the logger.
                    logger.error(
                        "Invalid username or not a public channel: {username}",
                        username=username,
                    )
                    continue
                else:
                    scrape_count += 1
            progress.desc = (
                f"Scraped {scrape_count} user{'s' if scrape_count != 1 else ''}"
            )
        finally:
            # Always release the progress bar, even if scraping raised.
            progress.close()
477 |
478 |
# Run the command-line interface only when this module is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()
481 |
--------------------------------------------------------------------------------
/test_tchan.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from lxml.html import document_fromstring
4 |
5 | from tchan import (
6 | ChannelInfo,
7 | ChannelMessage,
8 | normalize_url,
9 | parse_info,
10 | parse_messages,
11 | )
12 |
# Channel URL passed to parse_messages() by most of the tests below.
original_url = "https://t.me/s/tchantest"
14 |
15 |
def test_normalize_url():
    """Each supported spelling of a channel normalizes to its /s/ preview URL."""
    canonical = "https://t.me/s/fulano"
    spellings = (
        # Full URLs, with and without the /s/ segment or a message id
        "https://t.me/fulano",
        "https://t.me/s/fulano",
        "https://t.me/fulano/12345",
        "https://t.me/s/fulano/12345",
        # Scheme-less URLs
        "t.me/fulano",
        "t.me/s/fulano",
        "t.me/s/fulano/12345",
        # Bare usernames
        "fulano",
        "@fulano",
    )
    for spelling in spellings:
        assert normalize_url(spelling) == canonical
28 |
29 |
30 | def test_channel_info():
31 | html = """
32 | [...]
33 |
34 |
35 |
36 |
37 | [...]
38 |
39 | [...]
40 | 1 subscriber
3 photos
2 videos
4 links
41 | [...]
42 | """
43 | tree = document_fromstring(html)
44 | result = parse_info(tree)
45 | expected = ChannelInfo(
46 | username="tchantest",
47 | title="tchan's test channel 👍",
48 | image_url="https://cdn1.telegram-cdn.org/file/pEJs58u1vQ4-YvOJ-6t1MAIcTPNIusLkfFzACh2CHzG-IOGGZVSKNsNIJhO-bkTdyAIabgzH7RqJBEjPLDWkJT7IYoQeCiDehrk1-KNRuXEgbCHMWDSxMuc9mOp-w3TJkfzLserjAsgwqVKE4fb0NouctjkVJHMcPkwxUVdoiEwEc6cUPP16fYQJfxKELtbBrfPpEha6Bdvfrhy2-6Sn3PPUx_krgiNduHJXXhc8zRcJt-YoOmX_McGV7EqZhEtDZHhRB2r441l4OJQzHjP7L-cA_y6g8cI1_hU7E8oLJJCoEdzHDrR2_z23MzjbHQ4F538BnqPEINvYBGJZP3h6Hg.jpg",
49 | description="Test channel for tchan Python library/CLI",
50 | subscribers=1,
51 | photos=3,
52 | videos=2,
53 | links=4,
54 | )
55 | assert result == expected
56 |
57 |
58 | def test_parse_service_message_channel_created():
59 | html = """
60 |
88 | """
89 | tree = document_fromstring(html)
90 | result = list(parse_messages(original_url, tree))
91 | expected = ChannelMessage(
92 | id=1,
93 | created_at=datetime.datetime(
94 | 2023, 2, 24, 7, 26, 49, tzinfo=datetime.timezone.utc
95 | ),
96 | type="service",
97 | channel="tchantest",
98 | urls=[],
99 | author=None,
100 | edited=False,
101 | text="Channel created",
102 | views=None,
103 | )
104 | assert result[0] == expected
105 |
106 |
107 | def test_parse_service_message_pinned():
108 | html = """
109 |
136 | """
137 | tree = document_fromstring(html)
138 | result = list(parse_messages(original_url, tree))
139 | expected = ChannelMessage(
140 | id=92,
141 | created_at=datetime.datetime(
142 | 2023, 2, 24, 12, 20, 19, tzinfo=datetime.timezone.utc
143 | ),
144 | type="service",
145 | channel="tchantest",
146 | urls=[],
147 | author=None,
148 | edited=False,
149 | text="tchan's test channel👍pinned «Going to pin this message»",
150 | views=None,
151 | )
152 | assert result[0] == expected
153 |
154 |
155 | def test_parse_multimedia_message():
156 | html = """
157 |
233 | """
234 | tree = document_fromstring(html)
235 | result = list(parse_messages(original_url, tree))
236 | expected = ChannelMessage(
237 | id=84,
238 | created_at=datetime.datetime(
239 | 2023, 2, 24, 11, 1, 46, tzinfo=datetime.timezone.utc
240 | ),
241 | type="multimedia",
242 | channel="tchantest",
243 | author=None,
244 | urls=[
245 | (
246 | "photo",
247 | "https://cdn1.telegram-cdn.org/file/CrsaR3dLCwSaTunI7l4nSUu5G7du7049yHXuZwbiOjAfSDEMrJaKiJ9-ly6RJOJf7wSYPGjpUppSBuqLkTbMIl_CMEpS_9nVLvNusCJTRXbhJbU4UFsTxiM89YyDE_9bapVEjoS9vhRS7qw1zSCbV2K42W3TZvvQ8scfiI2xiMIsRkw-YzpIVxbkzpeWz3-US6fi7DswlIobEgCG0uxmHdr6q2FEFOn9BCpfQHlrDqq8rCA9kBteMinAEkALObzktjJ76PMFWQZbQCcKIofW9oOThEJRAdFrRaho9PwBOQIcrSf_2MQmyqg8zA79k04ME76FsNNw3xr7xA160MHckA.jpg",
248 | ),
249 | (
250 | "photo",
251 | "https://cdn1.telegram-cdn.org/file/QMvda1Z9IET0DA4-jsfevuguTxnTpn6omRRA7gfRfYs6VCmxjsZdeX4R6k4n7rYe7skLclk-A1G7Dw4UuYzV5Ogj9KkYlLSsJD2x_WQDRqmUDauUWEAGa-JNACCFBH2zRJeFwb6OvAD6itpb05MbvyMVvsR18sw8Qe4VcPvODZmfKGxQ7ioslSsxTKoLz8KEAIcvPqVLkJKjkPVERS6u8QBSS-ZPOWu6RvJsS0_fS7oCTSfLjTmn3EfGlU3BvRAmNcVCQv7Jz-CPNBgXHUk5Bru05WNpsdaoCVfX9PzeDycLIoHm5S55H_TV9zocKvg_ZX0NVyLE_Em_wO7nbDwDRQ.jpg",
252 | ),
253 | (
254 | "video",
255 | "https://cdn1.telegram-cdn.org/file/bd7ceb0b41.mp4?token=FqRn4LGTlVsYQNCiPqd19TF0Z-EK6gWw__1F7os3qthEAdi8kkkrrWeUqJYCrNsv7OF80hlKUHUfKi_9PMsFbg28A8zONLdULTjmt256uaIBQqc3ieY_Js1kg8ezAIi5Y2tab_Z9dycZBrPQjARRoTVwLd9v_QrIQ7il3W1iXipnClgLZK4PxFjFOIhdYmEM_gAhxylo30BdpQFekkk5xCwNNTKmFjeaZTUQ_NNM1k3NGuKQo3SmrHUj4hK1xoKayOZxvB0oJIffTpMkHEBjs_j-tyi72uKmRz7my8ogBY0pHO-2Em5lDh-Li0rDGfHJ_NUssmwGqoToH-el_FZfZWA7juyZs_QNSkWABs0ijiVXDKZPKZKiOVQ0Uh8RtNs4UrNKp91GAVLLCb-C9TpW9SRlgMECG4yKWnXZ7nIsh_nhRNMk-aL0_Lehl0D4bRi1xMkXqtCu5B9pMS7MhIoNbg1sK7_4SJi3YRLugH_jiOGx3HhOP9HtBQZZTBZffb_d3vQQHNfXWEYUwP3t8iBwDrFL0qPwlT_1xMM57S6JlS3ApAEqqYFRxD6YBBFcJr3HMb2rinmL77iwCPtOirSeXFNaG0daSIaUn2jOMoDVH-MwVMBr6Oz_ESOWlqiGtHngUxKxgsbC9UxgvzcXXzHNelvl0z8isMf6oz0QqAntnWA",
256 | ),
257 | (
258 | "video",
259 | "https://cdn1.telegram-cdn.org/file/e996ca12fa.mp4?token=KO0u7GXRUAGbO8QgNdwm-ZedWF5dNzcqm4VeQk_2XikjDahyMnyWKup0S9kyPO4piQqxDlK0yDsFa-myEr1LQPNnJdd1KqYUTiUGWgwI9d-9cA9d-J1U8mtiWDDiaLctgP73nFHrVNbpBopQELyGobP5ha5ofRzEC494a6QHkcKAakFRWlkMu2u2n_HAQZQhurOVvmJBW0pA_yMIv_lrVsHfmvjw-jGwd_dnou5l-158l_0i21I8jzjBMJ4bam9ayHgn3iEjw0uDcmmb-I9-i7Nz1vPQjRJs34_Qjyp6vDrawOGsOY0sAWT7r4lSpefK1Rdc0XQPwQgMY_izO_QfWw",
260 | ),
261 | (
262 | "thumbnail-video",
263 | "https://cdn1.telegram-cdn.org/file/nWyXAzTHX19MAA8oKaS4tcW-afBU7oOvCpQfVX-Z8l5dVuBeBUDZ5wVn0LRN5OIkwxbDAoSVuJiVm8BPt__9PML4K9rBmVcinjrLUtbNcqhx1WF-JpR_jd5FGQXQStSPYwgYyf460aoQsrVu48r-h4UGyuRLcnNoXytG-siQ_kOxpMArdeZXIQRzNW9eWHfO-IaIx78jqLdhcUaueZjkix91Ak35tPDdloQ7vhzDGqtpJQ1bI6YWVVP0gAm7AQdqcmkoNfFwXjh4uvxEAvsJxdSfMNJ3A1uKhqkm-_fVcbUBgBcJoPIjBBpJx5A3yorSMoMFCvne1QIvFdfjNbGZbg",
264 | ),
265 | (
266 | "thumbnail-video",
267 | "https://cdn1.telegram-cdn.org/file/jFd1zyUf89Ze7MO2jtuFOuJXVJrYf9-QNmuE_f0x07Nwkx6KgVXhq71eAMiuTYhg1T9lmLA74NfOWgLwLaNk5H4OZYUtQBGrRrGeFaOcnRDcv9jOb23ZAjj6BHMDJ3bfFh_lmsAKQzQIuhJesbi3kBeioa4BeVYW4qjRYUmoKuRYpH6kr2eAalOQ_IYV8p0RxqGbhrJO3vKSYhwodxb3lYI-RKLDUrhSGQUy43hKEt1Epb0rwnpXnXsHcTrbo96O-bqfZ20A_wPqgzMyA13Xa7guojvXlRD2bpd-ezUYdwh9yBb636oqfkvTkaEI8rH4azIJxSB5eVlug5cN2wk97g",
268 | ),
269 | ],
270 | edited=False,
271 | text="Multiple videos and pictures",
272 | views=436_600,
273 | )
274 | assert result[0] == expected
275 |
276 |
277 | def test_parse_service_message_channel_video_changed():
278 | html = """
279 |
306 | """
307 | tree = document_fromstring(html)
308 | result = list(parse_messages(original_url, tree))
309 | expected = ChannelMessage(
310 | id=82,
311 | created_at=datetime.datetime(
312 | 2023, 2, 24, 10, 15, 33, tzinfo=datetime.timezone.utc
313 | ),
314 | type="text",
315 | channel="tchantest",
316 | author=None,
317 | urls=[],
318 | edited=False,
319 | text='+ "Channel video changed"',
320 | views=3_100_000,
321 | )
322 | assert result[0] == expected
323 |
324 |
325 | def test_parse_service_message_channel_name_changed():
326 | html = """
327 |
354 | """
355 | tree = document_fromstring(html)
356 | result = list(parse_messages(original_url, tree))
357 | expected = ChannelMessage(
358 | id=2,
359 | created_at=datetime.datetime(
360 | 2023, 2, 24, 7, 28, 1, tzinfo=datetime.timezone.utc
361 | ),
362 | type="service",
363 | channel="tchantest",
364 | author=None,
365 | urls=[],
366 | edited=False,
367 | text="Channel name was changed to «tchan's test channel»",
368 | views=None,
369 | )
370 | assert result[0] == expected
371 |
372 |
373 | def test_parse_service_message_channel_photo_updated():
374 | html = """
375 |
403 | """
404 | tree = document_fromstring(html)
405 | result = list(parse_messages(original_url, tree))
406 | expected = ChannelMessage(
407 | id=3,
408 | created_at=datetime.datetime(
409 | 2023, 2, 24, 7, 29, 23, tzinfo=datetime.timezone.utc
410 | ),
411 | type="service",
412 | channel="tchantest",
413 | author=None,
414 | edited=False,
415 | text="Channel photo updated",
416 | views=None,
417 | urls=[
418 | (
419 | "photo",
420 | "https://cdn1.telegram-cdn.org/file/DL-kcK51w4o7tyr5QWQWK7YexgMdwiKIVYbDNbzB2qUtyk9uYfrKo0t19LY08bW4WTdmGpI9t0YQ2aU3RpsaWVk_4Q9QfjBIjaM894tj1r96LzJ8PGXOLkHd3w_KDciIw-AFmZBAKs5UIK6WU6PW1Nx1uh9e084u9rKJQtVu7EZLx1YCgxtx5R69qSKCamUbie0yqbaocYeevtymiMw6C_BeYwLZux6iMhoejvs6jyaQXiQLtm53xvAcqPKefzM0frCmDU1t5sllrHJD7L2iv52m9j27Kcyi-cu6detpDOwxdC2Be9CvsN4UXDOwvxiEl3TQSkrFKb06csVd85lEaQ.jpg",
421 | ),
422 | ],
423 | )
424 | assert result[0] == expected
425 |
426 |
427 | def test_parse_text_message_multiline():
428 | html = """
429 |
456 | """
457 | tree = document_fromstring(html)
458 | result = list(parse_messages(original_url, tree))
459 | expected = ChannelMessage(
460 | id=62,
461 | created_at=datetime.datetime(
462 | 2023, 2, 24, 8, 1, 26, tzinfo=datetime.timezone.utc
463 | ),
464 | type="text",
465 | channel="tchantest",
466 | urls=[],
467 | author=None,
468 | edited=False,
469 | text="Bigger\ntext\nmessage\nnumber\n34",
470 | views=2,
471 | )
472 | assert result[0] == expected
473 |
474 |
475 | def test_parse_forwarded_text_message():
476 | html = """
477 |
506 | """
507 | tree = document_fromstring(html)
508 | result = list(parse_messages(original_url, tree))
509 | expected = ChannelMessage(
510 | id=89,
511 | created_at=datetime.datetime(
512 | 2023, 2, 24, 12, 13, 31, tzinfo=datetime.timezone.utc
513 | ),
514 | type="text",
515 | channel="tchantest",
516 | urls=[],
517 | author=None,
518 | edited=False,
519 | text=";)",
520 | views=2,
521 | forwarded_author="Some user",
522 | forwarded_author_url="https://t.me/some_user",
523 | )
524 | assert result[0] == expected
525 |
526 |
527 | def test_parse_text_message_signed_not_edited():
528 | html = """
529 |
556 | """
557 | tree = document_fromstring(html)
558 | result = list(parse_messages(original_url, tree))
559 | expected = ChannelMessage(
560 | id=79,
561 | created_at=datetime.datetime(
562 | 2023, 2, 24, 8, 7, 51, tzinfo=datetime.timezone.utc
563 | ),
564 | type="text",
565 | channel="tchantest",
566 | urls=[],
567 | author="Álvaro Justen",
568 | edited=False,
569 | text="Signed and not edited",
570 | views=2,
571 | )
572 | assert result[0] == expected
573 |
574 |
575 | def test_parse_text_message_link_no_preview():
576 | html = """
577 |
604 | """
605 | tree = document_fromstring(html)
606 | result = list(parse_messages(original_url, tree))
607 | expected = ChannelMessage(
608 | id=24,
609 | created_at=datetime.datetime(
610 | 2023, 2, 24, 7, 48, 2, tzinfo=datetime.timezone.utc
611 | ),
612 | type="text",
613 | channel="tchantest",
614 | urls=[("link", "https://brasil.io/")],
615 | author=None,
616 | edited=True,
617 | text="This is a message with a link\nhttps://brasil.io/",
618 | views=2,
619 | )
620 | assert result[0] == expected
621 |
622 |
623 | def test_parse_text_message_link_with_regular_preview():
624 | html = """
625 |
661 | """
662 | tree = document_fromstring(html)
663 | result = list(parse_messages(original_url, tree))
664 | expected = ChannelMessage(
665 | id=94,
666 | created_at=datetime.datetime(
667 | 2023, 2, 24, 13, 47, 38, tzinfo=datetime.timezone.utc
668 | ),
669 | type="text",
670 | channel="tchantest",
671 | urls=[
672 | (
673 | "link",
674 | "https://agenciabrasil.ebc.com.br/justica/noticia/2022-08/operacao-guardioes-do-bioma-apreende-239-toneladas-de-minerio",
675 | )
676 | ],
677 | author=None,
678 | edited=False,
679 | text="https://agenciabrasil.ebc.com.br/justica/noticia/2022-08/operacao-guardioes-do-bioma-apreende-239-toneladas-de-minerio",
680 | views=1,
681 | preview_url="https://agenciabrasil.ebc.com.br/justica/noticia/2022-08/operacao-guardioes-do-bioma-apreende-239-toneladas-de-minerio",
682 | preview_image_url="https://cdn4.telegram-cdn.org/file/C2HB_eIeUYDV3yo1xut1cz6d06v6ITJW1fnuT7uihR5nIzUG9unDbq_cCbJag2TFdg7C_uJReq5lTcu9HZHI88el4u17YROLNW-rm4nLJCGc9d7L8Pfvkf4wLEx7pfY32k68VOXqg3XQ3Y0M1HgiEZyz9IIY9WvqImvvgwWG5f_czeIe8cC8h_X7JAkwUnoNsPlPf6qzqfV5QBswqQQF0PoRzYxd3L-uLAAreSvahHIFTnhLWQZCNQXxucYd9-Ct-w6voFkGtBkpF68Tx5i6QdTFbWp6WqR4LR7BeNTdTgeMGoZN1x46I_maRaeeHyqDRqD9cJLl08hMH6NLtMKbkw.jpg",
683 | preview_site_name="Agência Brasil",
684 | preview_title="Operação Guardiões do Bioma apreende 23,9 toneladas de minério",
685 | preview_description="Ação contra o garimpo ilegal em Terra Indígena Yanomami durou um mês e resultou na prisão de 25 pessoas, apreensão de aeronaves e munições e em 115 autos de infração.",
686 | )
687 | assert result[0] == expected
688 |
689 |
690 | def test_parse_text_message_link_with_right_preview():
691 | html = """
692 |
728 | """
729 | tree = document_fromstring(html)
730 | result = list(parse_messages(original_url, tree))
731 | expected = ChannelMessage(
732 | id=81,
733 | created_at=datetime.datetime(
734 | 2023, 2, 24, 8, 16, 18, tzinfo=datetime.timezone.utc
735 | ),
736 | type="text",
737 | channel="tchantest",
738 | urls=[("link", "https://python.org/")],
739 | author=None,
740 | edited=False,
741 | text="link with preview\nhttps://python.org/",
742 | views=2,
743 | preview_url="https://www.python.org/",
744 | preview_image_url="https://cdn4.telegram-cdn.org/file/mcgzW-avL5x2aBHAMw_8xb-MEiP5rCavDScU8vCkIIiYDgc202XMtQ4daRGRZVGU8uIHWwOyWa-Io-NeHkdrbj87eaHQCMgH6t6T4cVrW5GUwQDuFgQpE7-7XFXWc2I_ffYrhqgZUqHfdJNIMovjz7H1i-Gk45e-rlFKlpb1bUOaOd07ISTdr1OCUSAbs7z6oofThWpyE_2AxA5upuupuiocaeMINNxnwnJ_ate8S3gvnGMq81trLqLtcrUI9Dlo1Na4QemQPH7IOz-ra6DhlyiHm6fb_Q0pDOpLvmpI73jODW3H7QfBjp5htgN7dNMtxkGxw11-tCmVU6gRTIU-5w.jpg",
745 | preview_site_name="Python.org",
746 | preview_title="Welcome to Python.org",
747 | preview_description="The official home of the Python Programming Language",
748 | )
749 | assert result[0] == expected
750 |
751 |
752 | def test_parse_unsigned_text_message():
753 | html = """
754 |
782 | """
783 | tree = document_fromstring(html)
784 | result = list(parse_messages(original_url, tree))
785 | expected = ChannelMessage(
786 | id=6,
787 | created_at=datetime.datetime(
788 | 2023, 2, 24, 7, 30, 39, tzinfo=datetime.timezone.utc
789 | ),
790 | type="text",
791 | channel="tchantest",
792 | urls=[],
793 | author=None,
794 | edited=False,
795 | text="Unsigned message",
796 | views=2,
797 | )
798 | assert result[0] == expected
799 |
800 |
801 | def test_parse_emoji_message():
802 | html = """
803 |
830 | """
831 | tree = document_fromstring(html)
832 | result = list(parse_messages(original_url, tree))
833 | expected = ChannelMessage(
834 | id=7,
835 | created_at=datetime.datetime(
836 | 2023, 2, 24, 7, 30, 53, tzinfo=datetime.timezone.utc
837 | ),
838 | type="text",
839 | channel="tchantest",
840 | author=None,
841 | edited=False,
842 | urls=[("photo", "https://telegram.org/img/emoji/40/F09F918D.png")],
843 | text="👍",
844 | views=2,
845 | )
846 | assert result[0] == expected
847 |
848 |
849 | def test_parse_sticker_message():
850 | html = """
851 |
887 | """
888 | tree = document_fromstring(html)
889 | result = list(parse_messages(original_url, tree))
890 | expected = ChannelMessage(
891 | id=9,
892 | created_at=datetime.datetime(
893 | 2023, 2, 24, 7, 31, 6, tzinfo=datetime.timezone.utc
894 | ),
895 | type="sticker",
896 | channel="tchantest",
897 | author=None,
898 | edited=False,
899 | urls=[
900 | (
901 | "photo",
902 | "https://cdn1.telegram-cdn.org/file/5b5c6e1325.webp?token=chQuhI8SVanorZnNJ_PTvHtJR1UOPC_cIjPNCVXhhG40BqJ9cpBCgrQy0NazQTCWO7bG_6JyNI4mFboxXSTcZJvATVgKwRTEkzFzeVen9a5AaZV36NUk9AWXUWFOaAX6jY4fKMQ3Sq6hicdTU4OjX4SvrwX501-pRHzw7b-dXMPwymHqMNwE-eVpiFu827y32eSOulEDWMvg2LMpmsIpks0b7fXcO-V-JvGwDvMsjVRy82406A5zElMjdD6lgBXZy5Hg79AyyVMJOprENkM0DY0evphw3gmq5G7YreJ9EcWIPX7K9skVfukCFxCxqjlHxV6T4aFGwxlZEJywBw_7pg",
903 | ),
904 | ],
905 | text=None,
906 | views=2,
907 | )
908 | assert result[0] == expected
909 |
910 |
911 | def test_parse_audio_document():
912 | html = """
913 |
950 | """
951 | tree = document_fromstring(html)
952 | result = list(parse_messages(original_url, tree))
953 | expected = ChannelMessage(
954 | id=12,
955 | created_at=datetime.datetime(
956 | 2023, 2, 24, 7, 33, 58, tzinfo=datetime.timezone.utc
957 | ),
958 | type="document",
959 | channel="tchantest",
960 | author=None,
961 | edited=False,
962 | urls=[],
963 | text="Povo hebreu",
964 | views=2,
965 | )
966 | assert result[0] == expected
967 |
968 |
969 | def test_parse_text_reply_to_video():
970 | html = """
971 |
972 |
1006 | """
1007 | tree = document_fromstring(html)
1008 | result = list(parse_messages(original_url, tree))
1009 | expected = ChannelMessage(
1010 | id=20,
1011 | created_at=datetime.datetime(
1012 | 2023, 2, 24, 7, 38, 31, tzinfo=datetime.timezone.utc
1013 | ),
1014 | type="text",
1015 | channel="tchantest",
1016 | author=None,
1017 | edited=False,
1018 | urls=[
1019 | (
1020 | "thumbnail-reply",
1021 | "https://cdn1.telegram-cdn.org/file/nWyXAzTHX19MAA8oKaS4tcW-afBU7oOvCpQfVX-Z8l5dVuBeBUDZ5wVn0LRN5OIkwxbDAoSVuJiVm8BPt__9PML4K9rBmVcinjrLUtbNcqhx1WF-JpR_jd5FGQXQStSPYwgYyf460aoQsrVu48r-h4UGyuRLcnNoXytG-siQ_kOxpMArdeZXIQRzNW9eWHfO-IaIx78jqLdhcUaueZjkix91Ak35tPDdloQ7vhzDGqtpJQ1bI6YWVVP0gAm7AQdqcmkoNfFwXjh4uvxEAvsJxdSfMNJ3A1uKhqkm-_fVcbUBgBcJoPIjBBpJx5A3yorSMoMFCvne1QIvFdfjNbGZbg",
1022 | ),
1023 | ],
1024 | text="Reply to a video (not recorded in telegram)",
1025 | views=2,
1026 | reply_to_id=19,
1027 | )
1028 | assert result[0] == expected
1029 |
1030 |
1031 | def test_parse_poll():
1032 | html = """
1033 |
1092 | """
1093 | tree = document_fromstring(html)
1094 | result = list(parse_messages(original_url, tree))
1095 | expected = ChannelMessage(
1096 | id=11,
1097 | created_at=datetime.datetime(
1098 | 2023, 2, 24, 7, 32, 23, tzinfo=datetime.timezone.utc
1099 | ),
1100 | type="poll",
1101 | channel="tchantest",
1102 | author=None,
1103 | edited=False,
1104 | urls=[],
1105 | text=None,
1106 | views=2,
1107 | )
1108 | assert result[0] == expected
1109 |
1110 |
1111 | def test_parse_photo_single():
1112 | html = """
1113 |
1144 | """
1145 | tree = document_fromstring(html)
1146 | result = list(parse_messages(original_url, tree))
1147 | expected = ChannelMessage(
1148 | id=16,
1149 | created_at=datetime.datetime(
1150 | 2023, 2, 24, 7, 35, 28, tzinfo=datetime.timezone.utc
1151 | ),
1152 | type="photo",
1153 | channel="tchantest",
1154 | author=None,
1155 | edited=False,
1156 | urls=[
1157 | (
1158 | "photo",
1159 | "https://cdn1.telegram-cdn.org/file/AqX5QJmpXiolNyLq3Aq8-4eqTHJtpueMZqcszNrmWGgUt4I4iaH-CPxmqR-QPdgdzVtE_rX8cpCOgeAOsN9Ais72d79W-56VIEOdCenSvm5YuK9nHh-faVhkQAnTnw3DtOobB6G3jbZRDvHdjhlxyojAwvNXWGSyfIzmaEPq9C_ut2VXo5gJk8ZOAUi5OfxIRf7UVSyNyXbfKXZPwDdok7uTLp1gTaLacKNdcHN0Wdo0NCj2phCO_Hhv5zOqvJkC_Ct-3d3vv_aa03gOWTXzGtq1sbrlynldi9zVtWn6TOsNBGzDGSWqcm7WUe1QbddpBeS7VQWqyih2fKmhKLr5wA.jpg",
1160 | ),
1161 | ],
1162 | text="Picture by telegram camera",
1163 | views=2,
1164 | )
1165 | assert result[0] == expected
1166 |
1167 |
1168 | def test_parse_message_weird_preview():
1169 | html = """
1170 |
1203 | """
1204 | tree = document_fromstring(html)
1205 | result = list(parse_messages("https://t.me/s/CamaradosDeputados", tree))
1206 | expected = ChannelMessage(
1207 | id=7334,
1208 | created_at=datetime.datetime(
1209 | 2022, 11, 9, 22, 43, 54, tzinfo=datetime.timezone.utc
1210 | ),
1211 | type="text",
1212 | channel="CamaradosDeputados",
1213 | author=None,
1214 | edited=True,
1215 | urls=[
1216 | ("photo", "https://telegram.org/img/emoji/40/E29C8D.png"),
1217 | ("link", "http://bit.ly/3Tq1C4G"),
1218 | (
1219 | "link",
1220 | "https://t.me/s/CamaradosDeputados?q=%23Puni%C3%A7%C3%A3o_ampliada",
1221 | ),
1222 | ("link", "http://bit.ly/3PCX1vn"),
1223 | ("link", "http://bit.ly/3tkdScB"),
1224 | (
1225 | "link",
1226 | "https://t.me/s/CamaradosDeputados?q=%23Poda_de_%C3%A1rvores",
1227 | ),
1228 | ("link", "http://bit.ly/3A1c4c4"),
1229 | ("link", "http://bit.ly/3A3EoKS"),
1230 | (
1231 | "link",
1232 | "https://t.me/s/CamaradosDeputados?q=%23Tarifas_de_transmiss%C3%A3o",
1233 | ),
1234 | ("link", "http://bit.ly/3zWxOpw"),
1235 | ("link", "https://bit.ly/2UBmLNC"),
1236 | ],
1237 | text="_\n✍️\nCÂMARA APROVA\n_\n🔸\nProjeto que aumenta penas para crimes sexuais contra crianças\n#Punição_ampliada\n🗳\nOpine sobre esta proposta\nclicando aqui!\n____\n🔸\nProjeto que facilita corte ou poda de árvore quando houver risco de acidente.\n#Poda_de_árvores\n🗳\nOpine sobre esta proposta\nclicando aqui!\n____\n🔸\nProposta que susta resoluções da Aneel sobre tarifas de transmissão\n.\n#Tarifas_de_transmissão\n🗳\nOpine sobre esta proposta\nclicando aqui!\n____\n📬\nConvide seus contatos do WhatsApp a conhecerem nosso canal\nclicando aqui!\n____",
1238 | views=1200,
1239 | preview_url="http://bit.ly/3Tq1C4G",
1240 | preview_image_url=None,
1241 | preview_site_name="Portal da Câmara dos Deputados",
1242 | preview_title="Câmara aprova projeto que aumenta penas para crimes sexuais contra crianças; acompanhe - Notícias",
1243 | preview_description=None,
1244 | )
1245 | assert result[0] == expected
1246 |
1247 |
1248 | def test_parse_video_big():
1249 | html = """
1250 |
1289 | """
1290 | tree = document_fromstring(html)
1291 | result = list(parse_messages(original_url, tree))
1292 | expected = ChannelMessage(
1293 | id=5343,
1294 | created_at=datetime.datetime(
1295 | 2023, 1, 30, 23, 30, 40, tzinfo=datetime.timezone.utc
1296 | ),
1297 | type="video",
1298 | channel="some_random_user",
1299 | author=None,
1300 | edited=False,
1301 | urls=[
1302 | (
1303 | "thumbnail-video",
1304 | "https://cdn1.telegram-cdn.org/file/HyfobRUDoAxInyfbDVXk1q13pW97NtZN0TYuraxfsJLPps5R14DRXfT1DN2qZkAg4UIpu4RE0frU5LJLK3Y9oWyDgw3Y-Jg92EghLSe0Wmb6dMqCFwoz2CISl_hAeNgDksSQ5i_feURS-NzwXJRHeHwqz1funqsyNkdC5irHPSglylhsf3ZdEZMA4b1XqYrU3Zz4IgT3pLz0HcPSHcxHSyRAGjmb6vFbUYr-qThcvTD7HGXdb6gVRhV3bQyzo8xS9d2Vxho2j735Rxnr5qaprcef883AEuYuDSD31Anok_s6cvOz0jurCG8jbtonkSN9omsub5oyLrvT0H2ZQ4Gx3g",
1305 | ),
1306 | ],
1307 | text="some text",
1308 | views=436600,
1309 | )
1310 | assert result[0] == expected
1311 |
1312 |
1313 | def test_parse_video_single():
1314 | html = """
1315 |
1357 | """
1358 | tree = document_fromstring(html)
1359 | result = list(parse_messages(original_url, tree))
1360 | expected = ChannelMessage(
1361 | id=18,
1362 | created_at=datetime.datetime(
1363 | 2023, 2, 24, 7, 36, 49, tzinfo=datetime.timezone.utc
1364 | ),
1365 | type="video",
1366 | channel="tchantest",
1367 | author=None,
1368 | edited=False,
1369 | urls=[
1370 | (
1371 | "video",
1372 | "https://cdn1.telegram-cdn.org/file/61dc623809.mp4?token=K5rf-tYZuK-hwjpm2Aa8NLNZI54LnZSlUftpmFaAjXdqeUtO0xfykZx8zfupPs7TYNYHMTDKJcmfoDs66q2bTsAk0d1kwrwZrArAGYOWBnJmTDualrn2gIQ8_VPKEQccf_k_sGyzGcbsCs9rkdlOTWMUQXOxoKDQ12X2hdnPONoehz0KAR_nH-W0lGKGUuxbUhu8yFVZGuu9JLFSp5dLprKNy9HYAJeh8D7yM3lZ7GmIG3ck5GLdjeYzRFx96sn0NviOl8H7kHcoDOCv_CkppMpXWninC8yboJUsmPISjfwQOiDD9ZOE2wZb7TN0zYSAHDFEy0Ga1msxs6f1j7Wjyw",
1373 | ),
1374 | (
1375 | "thumbnail-video",
1376 | "https://cdn1.telegram-cdn.org/file/WiZBMqAIrM0sjmV7tKxjJQ3BcyP04k0bU3i1_kCI_cipeYNV2EgXyUIZ8Jii-7BMgdpGH9HKFCY8NuzsnvpYToh3PJdSdd4aOwSrYvu8uGQgogharMY-8IjAdcFNQh0stJ7r1-3mjJCT-1SojXo1LOBCt-sX7PI9woHjvqPFDDJyv9-xNbEgwWYMuKUyCA6Z1aKsEAz0wIDzLil4IXze3_neSQRkSlqWUnCGV_JoPy-qQuJd_6do_AnJMaLwlDcFBYAzZ-sX1kJC03qWhtjHUy9uaG9j8z23C_RcSlhWPTYPQi5t0x1HQXRWm-kaASMrghIIg9HFSu4MdCLOb8IgIA",
1377 | ),
1378 | ],
1379 | text="Video by telegram camera",
1380 | views=2,
1381 | )
1382 | assert result[0] == expected
1383 |
1384 |
1385 | def test_parse_video_single_2():
1386 | html = """
1387 |
1426 | """
1427 | tree = document_fromstring(html)
1428 | result = list(parse_messages(original_url, tree))
1429 | expected = ChannelMessage(
1430 | id=19,
1431 | created_at=datetime.datetime(
1432 | 2023, 2, 24, 7, 38, 13, tzinfo=datetime.timezone.utc
1433 | ),
1434 | type="video",
1435 | channel="tchantest",
1436 | author=None,
1437 | edited=False,
1438 | urls=[
1439 | (
1440 | "video",
1441 | "https://cdn1.telegram-cdn.org/file/bd7ceb0b41.mp4?token=FqRn4LGTlVsYQNCiPqd19TF0Z-EK6gWw__1F7os3qthEAdi8kkkrrWeUqJYCrNsv7OF80hlKUHUfKi_9PMsFbg28A8zONLdULTjmt256uaIBQqc3ieY_Js1kg8ezAIi5Y2tab_Z9dycZBrPQjARRoTVwLd9v_QrIQ7il3W1iXipnClgLZK4PxFjFOIhdYmEM_gAhxylo30BdpQFekkk5xCwNNTKmFjeaZTUQ_NNM1k3NGuKQo3SmrHUj4hK1xoKayOZxvB0oJIffTpMkHEBjs_j-tyi72uKmRz7my8ogBY0pHO-2Em5lDh-Li0rDGfHJ_NUssmwGqoToH-el_FZfZWA7juyZs_QNSkWABs0ijiVXDKZPKZKiOVQ0Uh8RtNs4UrNKp91GAVLLCb-C9TpW9SRlgMECG4yKWnXZ7nIsh_nhRNMk-aL0_Lehl0D4bRi1xMkXqtCu5B9pMS7MhIoNbg1sK7_4SJi3YRLugH_jiOGx3HhOP9HtBQZZTBZffb_d3vQQHNfXWEYUwP3t8iBwDrFL0qPwlT_1xMM57S6JlS3ApAEqqYFRxD6YBBFcJr3HMb2rinmL77iwCPtOirSeXFNaG0daSIaUn2jOMoDVH-MwVMBr6Oz_ESOWlqiGtHngUxKxgsbC9UxgvzcXXzHNelvl0z8isMf6oz0QqAntnWA",
1442 | ),
1443 | (
1444 | "thumbnail-video",
1445 | "https://cdn1.telegram-cdn.org/file/nWyXAzTHX19MAA8oKaS4tcW-afBU7oOvCpQfVX-Z8l5dVuBeBUDZ5wVn0LRN5OIkwxbDAoSVuJiVm8BPt__9PML4K9rBmVcinjrLUtbNcqhx1WF-JpR_jd5FGQXQStSPYwgYyf460aoQsrVu48r-h4UGyuRLcnNoXytG-siQ_kOxpMArdeZXIQRzNW9eWHfO-IaIx78jqLdhcUaueZjkix91Ak35tPDdloQ7vhzDGqtpJQ1bI6YWVVP0gAm7AQdqcmkoNfFwXjh4uvxEAvsJxdSfMNJ3A1uKhqkm-_fVcbUBgBcJoPIjBBpJx5A3yorSMoMFCvne1QIvFdfjNbGZbg",
1446 | ),
1447 | ],
1448 | text=None,
1449 | views=2,
1450 | )
1451 | assert result[0] == expected
1452 |
1453 |
1454 | def test_parse_round_video_single():
1455 | html = """
1456 |
1504 | """
1505 | tree = document_fromstring(html)
1506 | result = list(parse_messages(original_url, tree))
1507 | expected = ChannelMessage(
1508 | id=17,
1509 | created_at=datetime.datetime(
1510 | 2023, 2, 24, 7, 35, 45, tzinfo=datetime.timezone.utc
1511 | ),
1512 | type="round-video",
1513 | channel="tchantest",
1514 | author=None,
1515 | edited=False,
1516 | urls=[
1517 | (
1518 | "round-video",
1519 | "https://cdn1.telegram-cdn.org/file/68145c80a1.mp4?token=Asgm3ihYDp5WkmfxRXtMximu8cGDQy4Y2UIeZ3JmSNpRz-oDSsXbORc8H4V4oaR4LkaIEFfPz82hrRD1Fvu2wNDljkcCVbxwa0D5iD_sfhWKTDK3Hfy1Fu1hQPZG0b4-1rEkNIfRJp8T_H0nW6Rej80Nl8I8xZAINfHAe0ibS6Qs5R4IudWG4ULL3NRmJLzGDM92YlqVlkEzwvLB9au1G0jczynEbJ7qn5npIxQYPPIBzZehLtZHAXqp_cnq6moGUL6mAfK1tOPqN13wVfizhtv2XHy8WqvaEVxZMTWFyMu64nPc1aB1EGsfvdgSdpV-pENMcsH5ihLEbwfCTbQPFw",
1520 | ),
1521 | (
1522 | "thumbnail-roundvideo",
1523 | "https://cdn1.telegram-cdn.org/file/U9Z-cXN1H4Y4JdCGNLPCaw8Y_4idAGdFDGqOMSS20fsGBN0OWrzYo-rtvGIgex8IkoAGGjqz2DduF369l5kY_jGB1zd7NpE9rqgSnB1hhUVFmxWqhP3bzsyTjJRABWJns176vCa_Jn8CEjHFddu39ONkG4Hyqc7wGDA6eOOMFF2pATNxePd-Jg056jTn79In9byN7cKk5Rlkt2hkY02vlaee_eokRInwNRQShvJ59Xdv0gEQV27DpSbjFvc4Ci_66unLiu3aWWA50etg5CbB2nmtSfLSs7ujGBteh5Aw9z0HjB4BVw3eNbZVrDKBUTe_5eFw6XE4RKwRnJ8LL4TPuA",
1524 | ),
1525 | ],
1526 | text=None,
1527 | views=2,
1528 | )
1529 | assert result[0] == expected
1530 |
1531 |
1532 | def test_parse_photo_multiple():
1533 | html = """
1534 |
1580 | """
1581 | tree = document_fromstring(html)
1582 | result = list(parse_messages(original_url, tree))
1583 | expected = ChannelMessage(
1584 | id=14,
1585 | created_at=datetime.datetime(
1586 | 2023, 2, 24, 7, 34, 20, tzinfo=datetime.timezone.utc
1587 | ),
1588 | type="photo",
1589 | channel="tchantest",
1590 | author=None,
1591 | edited=False,
1592 | urls=[
1593 | (
1594 | "photo",
1595 | "https://cdn1.telegram-cdn.org/file/QMvda1Z9IET0DA4-jsfevuguTxnTpn6omRRA7gfRfYs6VCmxjsZdeX4R6k4n7rYe7skLclk-A1G7Dw4UuYzV5Ogj9KkYlLSsJD2x_WQDRqmUDauUWEAGa-JNACCFBH2zRJeFwb6OvAD6itpb05MbvyMVvsR18sw8Qe4VcPvODZmfKGxQ7ioslSsxTKoLz8KEAIcvPqVLkJKjkPVERS6u8QBSS-ZPOWu6RvJsS0_fS7oCTSfLjTmn3EfGlU3BvRAmNcVCQv7Jz-CPNBgXHUk5Bru05WNpsdaoCVfX9PzeDycLIoHm5S55H_TV9zocKvg_ZX0NVyLE_Em_wO7nbDwDRQ.jpg",
1596 | ),
1597 | (
1598 | "photo",
1599 | "https://cdn1.telegram-cdn.org/file/CrsaR3dLCwSaTunI7l4nSUu5G7du7049yHXuZwbiOjAfSDEMrJaKiJ9-ly6RJOJf7wSYPGjpUppSBuqLkTbMIl_CMEpS_9nVLvNusCJTRXbhJbU4UFsTxiM89YyDE_9bapVEjoS9vhRS7qw1zSCbV2K42W3TZvvQ8scfiI2xiMIsRkw-YzpIVxbkzpeWz3-US6fi7DswlIobEgCG0uxmHdr6q2FEFOn9BCpfQHlrDqq8rCA9kBteMinAEkALObzktjJ76PMFWQZbQCcKIofW9oOThEJRAdFrRaho9PwBOQIcrSf_2MQmyqg8zA79k04ME76FsNNw3xr7xA160MHckA.jpg",
1600 | ),
1601 | ],
1602 | text="Multiple pics",
1603 | views=2,
1604 | )
1605 | assert result[0] == expected
1606 |
1607 |
1608 | def test_parse_location_message():
1609 | html = """
1610 |
1640 | """
1641 | tree = document_fromstring(html)
1642 | result = list(parse_messages(original_url, tree))
1643 | expected = ChannelMessage(
1644 | id=10,
1645 | created_at=datetime.datetime(
1646 | 2023, 2, 24, 7, 31, 38, tzinfo=datetime.timezone.utc
1647 | ),
1648 | type="location",
1649 | channel="tchantest",
1650 | author=None,
1651 | edited=False,
1652 | urls=[
1653 | (
1654 | "link",
1655 | "https://maps.google.com/maps?q=-23.930587176407,-44.086305367818&ll=-23.930587176407,-44.086305367818&z=16",
1656 | ),
1657 | ],
1658 | text=None,
1659 | views=2,
1660 | )
1661 | assert result[0] == expected
1662 |
1663 |
1664 | def test_parse_location_message_2():
1665 | html = """
1666 |
1695 | """
1696 | tree = document_fromstring(html)
1697 | result = list(parse_messages(original_url, tree))
1698 | expected = ChannelMessage(
1699 | id=83,
1700 | created_at=datetime.datetime(
1701 | 2023, 2, 24, 11, 1, 22, tzinfo=datetime.timezone.utc
1702 | ),
1703 | type="location",
1704 | channel="tchantest",
1705 | author=None,
1706 | edited=False,
1707 | urls=[
1708 | (
1709 | "link",
1710 | "https://maps.google.com/maps?q=-23.532363086523,-46.689620092745&ll=-23.532363086523,-46.689620092745&z=16",
1711 | ),
1712 | ],
1713 | text=None,
1714 | views=1,
1715 | )
1716 | assert result[0] == expected
1717 |
1718 |
1719 | def test_parse_location_message_3():
1720 | html = """
1721 |
1750 | """
1751 | tree = document_fromstring(html)
1752 | result = list(parse_messages(original_url, tree))
1753 | expected = ChannelMessage(
1754 | id=88,
1755 | created_at=datetime.datetime(
1756 | 2023, 2, 24, 11, 23, 40, tzinfo=datetime.timezone.utc
1757 | ),
1758 | type="location",
1759 | channel="tchantest",
1760 | author=None,
1761 | edited=False,
1762 | urls=[
1763 | (
1764 | "link",
1765 | "https://maps.google.com/maps?q=-23.531972140424,-46.689076113848&ll=-23.531972140424,-46.689076113848&z=16",
1766 | ),
1767 | ],
1768 | text=None,
1769 | views=2,
1770 | )
1771 | assert result[0] == expected
1772 |
1773 |
1774 | def test_parse_audio_message():
1775 | html = """
1776 |
1819 | """
1820 | tree = document_fromstring(html)
1821 | result = list(parse_messages(original_url, tree))
1822 | expected = ChannelMessage(
1823 | id=13,
1824 | created_at=datetime.datetime(
1825 | 2023, 2, 24, 7, 34, 5, tzinfo=datetime.timezone.utc
1826 | ),
1827 | type="audio",
1828 | channel="tchantest",
1829 | author=None,
1830 | edited=False,
1831 | urls=[
1832 | (
1833 | "audio",
1834 | "https://cdn1.telegram-cdn.org/file/29881a3f30.ogg?token=RPh7-yUn9te932hRlzltMAmWTgmmAzD_PswWMJnmCZSfQQ63SfT5jfT_IpSq71gXP0d5F_G3fZLc1mxLuPR_NFSBVncAk7hvT5086hIQJfHX8qoE8VW-714sWoGTqVv6l35yS7V8_hCkt4KD2kW3F_5K2MPJ6yKPHBB4VzvlLmrwUoVVTPPOl2NJypUtDdN2fwFj7TkeZYHu0jrtKGrynmmBOp36SHpks7c9bkkL8HcaHhlzZXCBzibbWh0IM895baESbOimQrxnUwuTE9gv_VHGa_EMiwBD71p_NyHYPfpgNxGi1TPCVzJSO5QVPXDBZ3MSHoYI4COLZMt_4wwx1A",
1835 | ),
1836 | ],
1837 | text=None,
1838 | views=2,
1839 | )
1840 | assert result[0] == expected
1841 |
1842 |
1843 | def test_parse_signed_edited_text_message():
1844 | html = """
1845 |
1872 | """
1873 | tree = document_fromstring(html)
1874 | result = list(parse_messages(original_url, tree))
1875 | expected = ChannelMessage(
1876 | id=5,
1877 | created_at=datetime.datetime(
1878 | 2023, 2, 24, 7, 29, 57, tzinfo=datetime.timezone.utc
1879 | ),
1880 | type="text",
1881 | channel="tchantest",
1882 | urls=[],
1883 | author="Álvaro Justen",
1884 | edited=True,
1885 | text="Hello! Signed and edited message",
1886 | views=2,
1887 | )
1888 | assert result[0] == expected
1889 |
1890 |
1891 | def test_parse_no_posts_found():
1892 | html = """
1893 |
1894 |
1899 |
1900 | """
1901 | tree = document_fromstring(html)
1902 | result = list(parse_messages(original_url, tree))
1903 | expected = []
1904 | assert result == expected
1905 |
--------------------------------------------------------------------------------