├── accless_tg_scraper ├── __init__.py ├── client.py ├── serialize │ ├── classes.py │ └── markdown.py ├── classes.py └── parser.py ├── .gitignore ├── tests ├── download-samples.sh ├── text-entities.py ├── client-test.py ├── parser-test.py └── tg_tests.py ├── setup.py └── README.md /accless_tg_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import * 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | /accless_tg_scraper.egg-info 3 | /build 4 | *.html 5 | reinstall.sh 6 | install-e-mode.sh -------------------------------------------------------------------------------- /tests/download-samples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -f tg-posts*.html tg-single-post*.html 3 | id=0 4 | function add () { 5 | local url=$1 6 | local url_type=$2 7 | id=$(($id + 1)) 8 | if [ $url_type == 1 ]; 9 | then 10 | curl $url -fo tg-posts-$id.html 11 | else 12 | curl "$url?embed=1&mode=tme" -fo tg-single-post-$id.html 13 | fi 14 | } 15 | 16 | add "https://t.me/s/evgenii_ponasenkov" 1 17 | add "https://t.me/evgenii_ponasenkov/7561" 2 18 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name = 'accless_tg_scraper', 5 | version = '0.2.0', 6 | author = 'Kisspeace', 7 | keywords = 'telegram scraper parser web', 8 | url = 'http://github.com/Kisspeace/accless-tg-scraper', 9 | description = 'Scrap telegram web WITHOUT account or API token', 10 | packages = ['accless_tg_scraper', 11 | 'accless_tg_scraper.serialize'], 12 | install_requires = [ 13 | 'aiohttp', 14 | 'bs4' 15 | ] 16 | ) 17 | -------------------------------------------------------------------------------- /tests/text-entities.py: -------------------------------------------------------------------------------- 1 | from accless_tg_scraper.serialize.markdown import * 2 | from accless_tg_scraper.serialize.classes import * 3 | from accless_tg_scraper.classes import * 4 | 5 | text = 'Hello. md has no spoilers support' 6 | entities = [ 7 | TgMessageEntityItalic(1, 2), 8 | TgMessageEntityBold(1, 2), 9 | TgMessageEntityUrl(0, 1, 'https://youtu.be/HTMDNZOlUq4'), 10 | TgMessageEntityStrikethrough(3, 3), 11 | TgMessageEntityBold(4, 1), 12 | TgMessageEntitySpoiler(7, 26) 13 | ] 14 | 15 | md = dump_content(text, entities, RULE_SET_MD) 16 | print(md) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### accless-tg-scraper 2 | Scrap posts from telegram web WITHOUT account or API token 3 | #### Install 4 | ```shell 5 | python -m pip install "git+https://github.com/Kisspeace/accless-tg-scraper.git#egg=accless-tg-scraper" 6 | ``` 7 | #### Simple example 8 | ```python 9 | import asyncio 10 | from accless_tg_scraper import * 11 | 12 | async def main(): 13 | telegram = TgScraper() 14 | page = await telegram.get_posts_page('evgenii_ponasenkov') 15 | posts = page.posts 16 | 17 | print(f'got {len(posts)} posts.') 18 | for post in posts: 19 | print(f'{post.url}:{post.content}\n') 20 | 21 | asyncio.run(main()) 22 | ``` 23 | -------------------------------------------------------------------------------- /tests/client-test.py: -------------------------------------------------------------------------------- 1 | #!python 2 | import asyncio 3 | from bs4 import BeautifulSoup 4 | from accless_tg_scraper.client import * 5 | from accless_tg_scraper.classes import * 6 | import re 7 | from tg_tests import * 8 | from accless_tg_scraper.serialize.markdown import * 9 | 10 | tg = TgScraper() 11 | last_posts_page = None 12 | 13 | ponasenkov_tg = 'evgenii_ponasenkov' 14 | 15 | async def get_n_print_channel_info(channel: str, *args, **kwargs): 16 | channel = await tg.get_channel_info(channel, *args, **kwargs) 17 | print_channel_info(channel) 18 | print(DELIM) 19 | 20 | async def get_n_print(channel: str, *args, **kwargs): 21 | global last_posts_page 22 | posts_page = await tg.get_posts_page(channel=channel, *args, **kwargs) 23 | last_posts_page = posts_page 24 | print_channel_info(posts_page.channel) 25 | print_posts(posts_page) 26 | 27 | async def main(): 28 | await get_n_print_channel_info(ponasenkov_tg) 29 | await get_n_print(ponasenkov_tg) 30 | await get_n_print(ponasenkov_tg, before=last_posts_page.posts[0].id) 31 | post = await tg.get_post(ponasenkov_tg, 7561) 32 | print_post(post) 33 | print(DELIM) 34 | if __name__ == '__main__': 35 | asyncio.run(main()) -------------------------------------------------------------------------------- /tests/parser-test.py: -------------------------------------------------------------------------------- 1 | #!python 2 | from bs4 import BeautifulSoup 3 | from accless_tg_scraper.parser import * 4 | import re 5 | from tg_tests import * 6 | from accless_tg_scraper.serialize.markdown import * 7 | import os 8 | import fnmatch 9 | 10 | def bs_from_file(filename: str) -> BeautifulSoup: 11 | fp = open(filename) 12 | page = BeautifulSoup(fp, 'html.parser') 13 | return page 14 | 15 | def test_url_parse(url: str): 16 | print(F"{channel_name_from_url(url)} from {url}") 17 | 18 | def test_post_id_parse(url: str): 19 | print(F"{post_id_from_url(url)} from {url}") 20 | 21 | test_url_parse('https://t.me/s/channel_name') 22 | test_url_parse('https://t.me/s/channel_name?after=1030') 23 | test_url_parse('https://t.me/channel_name/752') 24 | test_url_parse('https://t.me/channel_name?someparams=sgduh23847tgdhs') 25 | test_url_parse('https://t.me/channel_name') 26 | test_url_parse('channel_name') 27 | 28 | test_post_id_parse('https://t.me/channel_name/1812') 29 | test_post_id_parse('https://t.me/channel_name/1488?embed=1&mode=tme') 30 | 31 | posts = [] 32 | 33 | def add_post(filename: str) -> TgPost: 34 | global posts 35 | web_page = bs_from_file(filename) 36 | post = parse_widget_post(web_page) 37 | print_post(post) 38 | posts.append(post) 39 | return post 40 | 41 | def add_posts(filename: str) -> list[TgPost]: 42 | global posts 43 | web_page = bs_from_file(filename) 44 | new_posts = parse_posts(web_page) 45 | print_posts(new_posts) 46 | posts = posts + new_posts 47 | 48 | for f in os.listdir(os.curdir): 49 | if fnmatch.fnmatch(f, 'tg-single-post*'): 50 | add_post(f) 51 | elif fnmatch.fnmatch(f, 'tg-posts*'): 52 | add_posts(f) 53 | 54 | # new = [] 55 | # for p in posts: 56 | # if p.has_service_msg(): 57 | # new.append(p) 58 | # print(f'service msg_ {p.service_msg.type} : {p.service_msg.extra}') 59 | # posts = new 60 | 61 | dump_posts(posts, 'dump.md', 'a') 62 | -------------------------------------------------------------------------------- /tests/tg_tests.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from bs4 import BeautifulSoup 3 | from accless_tg_scraper.parser import * 4 | from accless_tg_scraper.client import * 5 | from accless_tg_scraper.classes import * 6 | import re 7 | 8 | DELIM = '' 9 | 10 | def print_channel_info(c: TgChannelInfo): 11 | print(f'{c.display_name} - {c.name} - {c.subscribers} subs, {c.photos} photos, {c.videos} videos, {c.links} links. {c.url}') 12 | print(f'avatar: {c.avatar}') 13 | print(f'desc: {c.description}') 14 | if c.has_preview: 15 | print(f'channel has preview page.') 16 | 17 | def print_post(post: TgPost): 18 | print('(' + post.author.name + ') ' + post.author.url + ' on ' + post.url + ' at ' + str(post.timestamp) + ' with ' + post.views + ' views.') 19 | 20 | if post.has_forward(): 21 | print('📰 forwarded from: ' + post.forwarded_from.name + ' : ' + post.forwarded_from.url) 22 | 23 | if post.has_reply(): 24 | print('✉️ reply: ' + post.reply.author_name + ' : ' + post.reply.url + ' : ' + post.reply.image_url) 25 | print('✉️ reply metatext: ' + post.reply.metatext) 26 | 27 | if post.content != "": 28 | print("Text: " + post.content) 29 | 30 | if post.has_sticker(): 31 | if not post.sticker.animated: 32 | print('🗿 Sticker: ' + post.sticker.image_url) 33 | else: 34 | print('🗿 Animated sticker: ' + post.sticker.video_url + ' ' + post.sticker.image_url) 35 | 36 | if post.has_not_supported: 37 | print('⚠️ Post has not supported media !') 38 | 39 | if post.has_voice(): 40 | print(f'🔊 {post.voice.duration} -> {post.voice.url}') 41 | 42 | if post.has_rounded_video(): 43 | print(f'📹 {post.rounded_video.duration} -> {post.rounded_video.url}\nthumb: {post.rounded_video.thumbnail}') 44 | 45 | if post.has_images(): 46 | for img in post.images: 47 | print('🌉 image: ' + img.url + ' : ' + img.url_single) 48 | 49 | if post.has_videos(): 50 | for vid in post.videos: 51 | print('🎥 video: ' + vid.url + ' : ' + vid.image_url + ' : ' + vid.url_single) 52 | 53 | if post.has_link_previews(): 54 | for link in post.link_previews: 55 | print('🔗 link (' + link.site_name + '): ' + link.url + ' - ' + link.title + ' - ' + link.description) 56 | print('🔗 link thumbnail: ' + link.image_url) 57 | 58 | if post.has_poll(): 59 | print(f"❔: {post.poll.question} with {post.poll.voters} voters:") 60 | i = 0 61 | for opt in post.poll.options: 62 | i += 1 63 | print(f"{i} ) [{opt.percents}%]: {opt.value}") 64 | 65 | if post.has_invoice(): 66 | print(f"💳: {post.invoice.title}: {post.invoice.description}") 67 | 68 | def print_posts(posts: any): 69 | if type(posts) is TgPostsPage: 70 | posts = posts.posts 71 | print('Count: ' + str(len(posts))) 72 | for post in posts: 73 | print_post(post) 74 | print(DELIM) -------------------------------------------------------------------------------- /accless_tg_scraper/client.py: -------------------------------------------------------------------------------- 1 | import asyncio, aiohttp 2 | import copy 3 | from bs4 import BeautifulSoup 4 | from accless_tg_scraper.classes import * 5 | from accless_tg_scraper.parser import * 6 | 7 | class TgScraper(): 8 | def __init__(self): 9 | self.base_url: str = TELEGRAM_WEB_URL 10 | self.timeouts = aiohttp.ClientTimeout(connect=0.6) 11 | self._headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, br', 14 | 'Accept-Language': 'en-US,en;q=0.5', 15 | 'TE': 'trailers', 16 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:107.0) Gecko/20100101 Firefox/107.0' 17 | } 18 | 19 | def _url_preview(self, channel_name: str) -> str: 20 | return f"{self.base_url}/s/{channel_name}" 21 | 22 | def _url_post_widget(self, channel: str, post_id: int) -> str: 23 | return f"{self.base_url}/{channel}/{post_id}?embed=1&mode=tme" 24 | 25 | def _bs(self, response) -> BeautifulSoup: 26 | return BeautifulSoup(response, 'html.parser') 27 | 28 | def _new_session(self, *args, **kwargs) -> aiohttp.ClientSession: 29 | return aiohttp.ClientSession(headers=self._headers, timeout=self.timeouts) 30 | 31 | # Setters & getters: 32 | 33 | def set_headers(self, headers: dict): 34 | self._headers = copy.deepcopy(headers) 35 | 36 | def get_headers(self) -> dict: 37 | return copy.deepcopy(self._headers) 38 | 39 | # functions 40 | 41 | async def get_post(self, channel: str, post_id: int) -> TgPost: 42 | res = None 43 | async with self._new_session() as session: 44 | resp = await session.get( 45 | url=self._url_post_widget(channel, post_id)) 46 | text = await resp.text() 47 | return parse_widget_post(self._bs(text)) 48 | 49 | async def get_posts_page(self, channel: str, q: str = '', before = '', after = '', full_url: str = '') -> TgPostsPage: 50 | res = None 51 | params = {} 52 | 53 | if full_url == '': 54 | url = self._url_preview(channel) 55 | 56 | params = { 57 | 'q': str(q), 58 | 'before': str(before), 59 | 'after': str(after) 60 | } 61 | else: 62 | url = full_url 63 | 64 | async with self._new_session() as session: 65 | resp = await session.get( 66 | url=url, 67 | params=params) 68 | text = await resp.text() 69 | 70 | res = parse_posts_page(self._bs(text)) 71 | return res 72 | 73 | async def get_channel_info(self, channel: str) -> TgChannelInfo: 74 | res = None 75 | async with self._new_session() as session: 76 | resp = await session.get(f'{self.base_url}/{channel}') 77 | text = await resp.text() 78 | res = parse_channel_info(self._bs(text)) 79 | return res -------------------------------------------------------------------------------- /accless_tg_scraper/serialize/classes.py: -------------------------------------------------------------------------------- 1 | from accless_tg_scraper.classes import * 2 | from copy import deepcopy 3 | 4 | class TgEntityRuleSet(): 5 | class EntityRule(): 6 | def __init__(self, ent_type: TgMessageEntity, prefix: str, postfix: str): 7 | self.prefix: str = prefix 8 | self.postfix: str = postfix 9 | self.type = ent_type 10 | 11 | def convert(self, entity: TgMessageEntity, source: str): 12 | sub_str = source[entity.offset : entity.offset + entity.length] 13 | return f'{self.prefix}{sub_str}{self.postfix}' 14 | 15 | def __init__(self): 16 | self.bold: self.EntityRule = None 17 | self.italic: self.EntityRule = None 18 | self.strikethrogh: self.EntityRule = None 19 | self.underlined: self.EntityRule = None 20 | self.url: self.EntityRule = None 21 | self.spoiler: self.EntityRule = None 22 | self.emoji: self.EntityRule = None 23 | 24 | def get_rules(self) -> list: 25 | return [ 26 | self.bold, 27 | self.italic, 28 | self.strikethrogh, 29 | self.underlined, 30 | self.url, 31 | self.spoiler, 32 | self.emoji 33 | ] 34 | 35 | def rule_by_type(self, ent_type: TgMessageEntity): 36 | for rule in self.get_rules(): 37 | if rule.type is ent_type: 38 | return rule 39 | return None 40 | 41 | def get_converted(self, entity: TgMessageEntity, source: str): 42 | rule = self.rule_by_type(type(entity)) 43 | return rule.convert(entity, source) 44 | 45 | def dump_content(content: str, entities: list, rule_set: TgEntityRuleSet) -> str: 46 | """ 47 | Args: 48 | content (str): text. 49 | entities (list): list of entities for given text. 50 | rule_set (TgEntityRuleSet): rule set. 51 | Returns: 52 | str: post.content converted to custom format with post.entities. 53 | """ 54 | 55 | if len(entities) < 1: 56 | return content 57 | 58 | entities = deepcopy(entities) 59 | res: str = content 60 | 61 | def replace_with(offset: int, length: int, string: str): 62 | nonlocal res 63 | slice_a = res[0 : offset] 64 | slice_b = res[offset + length : len(res)] 65 | res = slice_a + string + slice_b 66 | 67 | l = len(entities) 68 | for i in range(0, l): 69 | ent = entities[i] 70 | rule = rule_set.rule_by_type(type(ent)) 71 | converted = rule.convert(ent, res) 72 | replace_with(ent.offset, ent.length, converted) 73 | 74 | # now need to remap all next entities. 75 | for n in range(i + 1, l): 76 | next_ent = entities[n] 77 | 78 | # info about next entity. 79 | start_after = next_ent.starts_after(ent) 80 | start_inside = next_ent.starts_inside(ent) 81 | 82 | if start_after: 83 | next_ent.offset += (len(converted) - ent.length) 84 | elif start_inside: 85 | next_ent.offset += len(rule.prefix) 86 | 87 | return res 88 | -------------------------------------------------------------------------------- /accless_tg_scraper/serialize/markdown.py: -------------------------------------------------------------------------------- 1 | from accless_tg_scraper.classes import * 2 | from accless_tg_scraper.serialize.classes import * 3 | from copy import deepcopy 4 | 5 | class TgEntityRuleSetMarkdown(TgEntityRuleSet): 6 | 7 | class EntityRuleUrl(TgEntityRuleSet.EntityRule): 8 | def __init__(self, ent_type=TgMessageEntityUrl, prefix='[', postfix=']'): 9 | TgEntityRuleSet.EntityRule.__init__(self, ent_type=TgMessageEntityUrl, prefix='[', postfix=']') 10 | self.prefix = prefix 11 | self.postfix = postfix 12 | self.url_prefix: str = '(' 13 | self.url_postfix: str = ')' 14 | 15 | def convert(self, entity: TgMessageEntity, source: str): 16 | sub_str = source[entity.offset : entity.offset + entity.length] 17 | return f'{self.prefix}{sub_str}{self.postfix}{self.url_prefix}{entity.url}{self.url_postfix}' 18 | 19 | def __init__(self): 20 | TgEntityRuleSet.__init__(self) 21 | self.bold = self.EntityRule(TgMessageEntityBold, '**', '**') 22 | self.italic = self.EntityRule(TgMessageEntityItalic, '*', '*') 23 | self.strikethrogh = self.EntityRule(TgMessageEntityStrikethrough, '~~', '~~') 24 | self.underlined = self.EntityRule(TgMessageEntityUnderlined, '', '') 25 | self.url = self.EntityRuleUrl() 26 | self.spoiler = self.EntityRule(TgMessageEntitySpoiler, '', '') 27 | self.emoji = self.EntityRule(TgMessageEntityEmoji, '', '') 28 | 29 | # global entity rule set for markdown. 30 | RULE_SET_MD = TgEntityRuleSetMarkdown() 31 | 32 | def dump_posts(posts: list[TgPost], file: any, fmode: str = 'a') -> None: 33 | if isinstance(file, str): 34 | file = open(file, mode=fmode) 35 | opened_localy = True 36 | else: 37 | opened_localy = False 38 | 39 | def out(string: str) -> None: 40 | file.write(string) 41 | 42 | for post in posts: 43 | views_str = f'with {post.views} views.' if post.views != '' else '' 44 | out(f'### [{post.author.display_name}]({post.author.url}): [post]({post.url}) at {post.timestamp} {views_str} \n') 45 | 46 | if post.has_forward(): 47 | out(f'**📰 forwarded from**: [{post.forwarded_from.name}]({post.forwarded_from.url}) \n') 48 | 49 | if post.has_reply(): 50 | out(f'**✉️ reply**: [{post.reply.author_name}]({post.reply.url}) \n') 51 | out(f'**✉️ reply metatext**: {post.reply.metatext} \n') 52 | 53 | if post.content != '': 54 | content = dump_content(post.content, post.entities, RULE_SET_MD) 55 | content = content.replace('\n', ' \n') 56 | out(f'{content} \n') 57 | 58 | if post.has_sticker(): 59 | if not post.sticker.animated: 60 | out(f'[🗿 Sticker]({post.sticker.image_url}) \n') 61 | else: 62 | out(f'[🗿 Sticker]({post.sticker.video_url}) [thumb]({post.sticker.image_url}) \n') 63 | 64 | if post.has_not_supported: 65 | out(f'~~⚠️ Post has not supported media !~~ \n') 66 | 67 | if post.has_voice(): 68 | out(f'[🔊 {post.voice.duration}]({post.voice.url}) \n') 69 | 70 | if post.has_rounded_video(): 71 | out(f'[📹 {post.rounded_video.duration}]({post.rounded_video.url}) \n ![thumbnail]({post.rounded_video.thumbnail}) \n') 72 | 73 | if post.has_documents(): 74 | for doc in post.documents: 75 | emoji = '🎵' if doc.type == TG_DOCUMENT_AUDIO else '💾' 76 | extra = f' - {doc.extra}' if doc.extra != '' else '' 77 | out(f'[{emoji} file]({doc.url}): **{doc.title}**{extra} \n') 78 | 79 | if post.has_images(): 80 | for img in post.images: 81 | out(f'![🌉 image]({img.url}) \n') 82 | 83 | if post.has_videos(): 84 | for vid in post.videos: 85 | if vid.url: 86 | out(f'[🎥 video]({vid.url}) \n') 87 | 88 | if post.has_link_previews(): 89 | for link in post.link_previews: 90 | out(f'[🔗 link ({link.site_name})]({link.url}): {link.title} - {link.description} \n') 91 | out(f'[🔗 link thumbnail]({link.image_url}) \n') 92 | 93 | if post.has_poll(): 94 | out(f'**❔ poll**: {post.poll.question} with {post.poll.voters} voters: \n') 95 | i = 0 96 | for opt in post.poll.options: 97 | i += 1 98 | out(f'{i} ) [{opt.percents}%]: {opt.value} \n') 99 | 100 | if post.has_invoice(): 101 | out(f'**💳 invoice**: {post.invoice.title}: {post.invoice.description} \n') 102 | 103 | out(f'\n') 104 | 105 | if opened_localy: 106 | file.close() 107 | -------------------------------------------------------------------------------- /accless_tg_scraper/classes.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | # Consts 4 | TG_SERVICE_MSG_UNKNOWN = -1 5 | TG_SERVICE_MSG_CHANNEL_CREATED = 0 6 | TG_SERVICE_MSG_CHANNEL_RENAMED = 1 7 | TG_SERVICE_MSG_CHANNEL_PHOTO_REMOVED = 2 8 | TG_SERVICE_MSG_CHANNEL_PHOTO_UPDATED = 3 9 | TG_SERVICE_MSG_LIVE_STREAM_FINISHED = 4 10 | TG_SERVICE_MSG_LIVE_STREAM_SHEDULED = 5 11 | TG_SERVICE_MSG_PINNED = 6 12 | 13 | # Document types: 14 | TG_DOCUMENT_UNKNOWN = -1 15 | TG_DOCUMENT_AUDIO = 0 16 | 17 | class TgChannel(): 18 | def __init__(self): 19 | self.url: str = '' 20 | self.avatar: str = '' 21 | self.name: str = '' 22 | self.display_name: str = '' 23 | 24 | class TgChannelInfo(TgChannel): 25 | def __init__(self): 26 | TgChannel.__init__(self) 27 | self.subscribers: str = '' # like '73.2 k' 28 | self.photos: str = '' 29 | self.videos: str = '' 30 | self.links: str = '' 31 | self.description: str = '' 32 | self.has_preview: bool = None # can be parsed from links like 'https://t.me/channel_name' 33 | 34 | class TgPostVoice(): 35 | def __init__(self): 36 | self.url: str = '' 37 | self.data_waveform: str = '' 38 | self.data_ogg: str = '' # sometimes empty 39 | self.duration: str = '' # like '0:25' 40 | 41 | class TgPostRoundedVideo(): 42 | def __init__(self): 43 | self.url: str = '' 44 | self.thumbnail: str = '' 45 | self.duration: str = '' 46 | 47 | class TgPostImage(): 48 | def __init__(self): 49 | self.url: str = '' 50 | self.url_single: str = '' 51 | 52 | class TgPostInvoice(): 53 | def __init__(self): 54 | self.title: str = '' 55 | self.description: str = '' 56 | 57 | class TgPostVideo(TgPostImage): 58 | def __init__(self): 59 | TgPostImage.__init__(self) 60 | self.image_url: str = '' # thumbnail 61 | 62 | class TgPostReply(): 63 | def __init__(self): 64 | self.author_name: str = '' 65 | self.url: str = '' 66 | self.image_url = '' 67 | self.metatext: str = '' 68 | 69 | class TgPostLinkPreview(): 70 | def __init__(self): 71 | self.site_name: str = '' # like 'YouTube' 72 | self.url: str = '' 73 | self.title: str = '' 74 | self.description: str = '' 75 | self.image_url: str = '' 76 | 77 | class TgSticker(): 78 | def __init__(self): 79 | self.animated: bool = False 80 | self.image_url: str = "" 81 | self.video_url: str = "" 82 | 83 | class TgPoll(): 84 | 85 | class TgPollOption(): 86 | def __init__(self): 87 | self.value: str = '' 88 | self.percents: int = -1 89 | 90 | def __init__(self): 91 | self.type: str = '' # Like 'Anonymous poll' 92 | self.question: str = '' 93 | self.options = [] # list of TgPollOption 94 | self.voters: str = '' # like '32.3k' 95 | 96 | class TgEmoji(): 97 | """Telegram emoji. 98 | id: Emoji id. 99 | custom: True if its a custom emoji. 100 | animated: True if its animated emoji. 101 | image_url: Original representation of emoji (also available for a custom). 102 | custom_image_url: Custom representation of emoji. 103 | data: Image svg+xml data. 104 | tgs_url: link on .tgs file. 105 | 106 | """ 107 | def __init__(self): 108 | self.id: int = -1 # Emoji id. 109 | self.custom: bool = False 110 | self.animated: bool = False 111 | self.image_url: str = '' 112 | self.custom_image_url: str = '' 113 | self.data: str = '' # Image data as text. 114 | self.tgs_url = '' # link on .tgs file. 115 | 116 | class TgMessageEntity(): 117 | """Base class for all message entities. 118 | See: https://core.telegram.org/api/entities 119 | 120 | offset: Offset in string. 121 | length: Characters count. 122 | """ 123 | def __init__(self, offset: int, length: int): 124 | self.offset = offset # Offset in string 125 | self.length = length # Characters count 126 | 127 | def same_place(self, entity) -> bool: 128 | return (self.offset == entity.offset) and (self.length == entity.length) 129 | 130 | def starts_after(self, entity) -> bool: 131 | """ 132 | Returns: 133 | bool: True if current entity start position is bigger than end position of given entity. 134 | """ 135 | return (self.offset >= (entity.offset + entity.length)) 136 | 137 | def starts_inside(self, entity) -> bool: 138 | """ 139 | Returns: 140 | bool: True if current entity starts inside given entity. 141 | """ 142 | return (not self.starts_after(entity)) and (self.offset >= entity.offset) 143 | 144 | class TgMessageEntityUrl(TgMessageEntity): 145 | """Message entity with text and url behind the text. 146 | """ 147 | def __init__(self, offset: int = 0, length: int = 0, url: str = ''): 148 | TgMessageEntity.__init__(self, offset, length) 149 | # self.text: str = text 150 | self.url: str = url 151 | 152 | class TgMessageEntityEmoji(TgMessageEntity): 153 | """Message entity with telegram emoji. 154 | """ 155 | def __init__(self, offset: int = 0, length: int = 0): 156 | TgMessageEntity.__init__(self, offset, length) 157 | self.emoji: TgEmoji = None 158 | 159 | class TgMessageEntityBold(TgMessageEntity): 160 | """Message entity with bold text. 161 | """ 162 | def __init__(self, offset: int = 0, length: int = 0): 163 | TgMessageEntity.__init__(self, offset, length) 164 | 165 | class TgMessageEntityItalic(TgMessageEntity): 166 | """Message entity with italic text. 167 | """ 168 | def __init__(self, offset: int = 0, length: int = 0): 169 | TgMessageEntity.__init__(self, offset, length) 170 | 171 | class TgMessageEntityStrikethrough(TgMessageEntity): 172 | """Message entity with Strikethrough text. 173 | """ 174 | def __init__(self, offset: int = 0, length: int = 0): 175 | TgMessageEntity.__init__(self, offset, length) 176 | 177 | class TgMessageEntityUnderlined(TgMessageEntity): 178 | """Message entity with underlined text. 179 | """ 180 | def __init__(self, offset: int = 0, length: int = 0): 181 | TgMessageEntity.__init__(self, offset, length) 182 | 183 | class TgMessageEntitySpoiler(TgMessageEntity): 184 | """Message entity with hidden text. 185 | """ 186 | def __init__(self, offset: int = 0, length: int = 0): 187 | TgMessageEntity.__init__(self, offset, length) 188 | 189 | class TgServiceMessage(): 190 | def __init__(self): 191 | self.type: int = TG_SERVICE_MSG_UNKNOWN 192 | self.extra: str = '' # (url, text) depends on type 193 | 194 | class TgDocument(): 195 | def __init__(self): 196 | self.type: int = TG_DOCUMENT_UNKNOWN 197 | self.url: str = '' 198 | self.title: str = '' 199 | self.extra: str = '' 200 | 201 | class TgPost(): 202 | def __init__(self): 203 | self.url: str = '' 204 | self.id: int = -1 205 | # self.type: int = TG_MESSAGE 206 | self.content: str = '' 207 | self.entities: list[TgMessageEntity] = [] 208 | self.timestamp: datetime = datetime.now() 209 | self.author: TgChannel = TgChannel() 210 | self.views: str = '' # like '1.8k' 211 | self.images: list[TgPostImage] = [] 212 | self.videos: list[TgPostVideo] = [] 213 | self.documents: list[TgDocument] = [] # list of attached files 214 | self.voice: TgPostVoice = None 215 | self.rounded_video: TgPostRoundedVideo = None 216 | self.link_previews: list[TgPostLinkPreview] = [] 217 | self.has_not_supported: bool = False # Media is too big : VIEW IN TELEGRAM 218 | self.forwarded_from: TgChannel = None 219 | self.reply: TgPostReply = None 220 | self.sticker: TgSticker = None 221 | self.poll: TgPoll = None 222 | self.invoice: TgPostInvoice = None 223 | self.service_msg: TgServiceMessage = None 224 | 225 | def has_service_msg(self) -> bool: 226 | return self.service_msg != None 227 | 228 | def has_forward(self) -> bool: 229 | return self.forwarded_from != None 230 | 231 | def has_reply(self) -> bool: 232 | return self.reply != None 233 | 234 | def has_sticker(self) -> bool: 235 | return self.sticker != None 236 | 237 | def has_voice(self) -> bool: 238 | return self.voice != None 239 | 240 | def has_rounded_video(self) -> bool: 241 | return self.rounded_video != None 242 | 243 | def has_images(self) -> bool: 244 | return len(self.images) > 0 245 | 246 | def has_videos(self) -> bool: 247 | return len(self.videos) > 0 248 | 249 | def has_entities(self) -> bool: 250 | return len(self.entities) > 0 251 | 252 | def has_link_previews(self) -> bool: 253 | return len(self.link_previews) > 0 254 | 255 | def has_documents(self) -> bool: 256 | return len(self.documents) > 0 257 | 258 | def has_poll(self) -> bool: 259 | return self.poll != None 260 | 261 | def has_invoice(self) -> bool: 262 | return self.invoice != None 263 | 264 | class TgPostsPage(): 265 | def __init__(self): 266 | self.posts: list[TgPost] = [] 267 | self.channel = TgChannelInfo() # channel info from right column on web page 268 | -------------------------------------------------------------------------------- /accless_tg_scraper/parser.py: -------------------------------------------------------------------------------- 1 | from accless_tg_scraper.classes import * 2 | from bs4 import BeautifulSoup 3 | from datetime import datetime 4 | from typing import Tuple 5 | import re 6 | 7 | TELEGRAM_WEB_URL = 'https://t.me' 8 | 9 | def channel_name_from_url(url: str, base_url: str = TELEGRAM_WEB_URL+'/') -> str: 10 | base = url.find(base_url) 11 | if base != -1: 12 | res = url[base+len(base_url):] 13 | snslash = res[:2] 14 | res = res[2:] if snslash == 's/' else res 15 | last = res.rfind('/') 16 | last = res.rfind('?') if last == -1 else last 17 | res = res[:last] if last != -1 else res 18 | return res 19 | else: 20 | return url 21 | 22 | def post_id_from_url(url: str, base_url: str = TELEGRAM_WEB_URL+'/') -> int: 23 | base_end = url.find(base_url) 24 | if base_end != -1: 25 | base_end += len(base_url) 26 | 27 | next_slash = url.find('/', base_end+1) 28 | if next_slash != -1: 29 | res = url[next_slash+1:] 30 | params_sign = res.find('?') 31 | if params_sign != -1: 32 | res = res[:params_sign] 33 | return res 34 | return None # default 35 | 36 | def parse_bg_image_url(style_str: str) -> str: 37 | return re.search("background-image:url\('(.*?)'\)", style_str).group(1) 38 | 39 | def parse_emoji(element: BeautifulSoup) -> TgEmoji: 40 | res = TgEmoji() 41 | if 'emoji-id' in element.attrs: 42 | res.id = element['emoji-id'] # Emoji id. 43 | 44 | tg_emoji_wrap = element.find(class_='tg-emoji-wrap') 45 | if tg_emoji_wrap is not None: 46 | 47 | tg_emoji: BeautifulSoup = tg_emoji_wrap.find(class_='tg-emoji') 48 | if tg_emoji is not None: 49 | res.custom = True 50 | res.custom_image_url = tg_emoji['data-webp'] 51 | 52 | if res.custom_image_url in ('', None): 53 | # tgs file url. 54 | source_tgs = tg_emoji.find(attrs={'type': 'application/x-tgsticker'}) 55 | res.tgs_url = source_tgs['srcset'] if source_tgs is not None else '' 56 | 57 | # svg+xml data. 58 | source_xml = tg_emoji.find(attrs={'type': 'image/svg+xml'}) 59 | res.data = source_xml['srcset'] if source_xml is not None else '' 60 | 61 | # Original enoji image url. 62 | emoji = element.find(class_='emoji') 63 | if emoji is not None: 64 | res.image_url = parse_bg_image_url(emoji['style']) 65 | 66 | return res 67 | 68 | def parse_text_with_entities(element: BeautifulSoup) -> Tuple[str, list[TgMessageEntity]]: 69 | """ 70 | Args: 71 | element (BeautifulSoup): element for parse text from. 72 | 73 | Returns: 74 | Tuple: first item is a full text, second item is a list of entities. 75 | """ 76 | FIX_ISSUES = True 77 | entities = [] 78 | full_text: str = '' 79 | 80 | def create_entity(subject: BeautifulSoup) -> TgMessageEntity: 81 | res = None 82 | if 'class' in subject.attrs: 83 | l_classes = subject.attrs['class'] 84 | else: 85 | l_classes = [] 86 | 87 | if (subject.name == 'tg-emoji') or ('emoji' in l_classes): # Telegram emoji. 88 | res = TgMessageEntityEmoji() 89 | res.emoji = parse_emoji(subject) 90 | elif (subject.name == 'b') or ('tgme_widget_service_strong_text' in l_classes): # Bold text. 91 | res = TgMessageEntityBold() 92 | elif subject.name == 'i': # Italic text. 93 | res = TgMessageEntityItalic() 94 | elif subject.name == 'a': # Hyperlink or user mention. 95 | res = TgMessageEntityUrl() 96 | res.url = subject['href'] 97 | elif subject.name == 'u': # Underlined text. 98 | res = TgMessageEntityUnderlined() 99 | elif subject.name == 's': # Strikethrough text. 100 | res = TgMessageEntityStrikethrough() 101 | elif subject.name == 'tg-spoiler': 102 | res = TgMessageEntitySpoiler() 103 | # elif subject.name == '': 104 | # pass 105 | return res 106 | 107 | def parse_entities(subject: BeautifulSoup, work_on_br: bool = True): 108 | DISALLOW_EMPTY_ENTITIES = True 109 | 110 | nonlocal full_text 111 | nonlocal entities 112 | 113 | for el in subject: 114 | if el.name is not None: # Is not just a text. 115 | if (el.name == 'br'): 116 | # br tag must break line like on the web-page. 117 | if work_on_br: 118 | full_text += '\n' 119 | else: 120 | # Create entity. 121 | allow_entity = True 122 | current_offset: int = len(full_text) 123 | entity: TgMessageEntity = create_entity(el) 124 | if entity is not None: 125 | entity.offset = current_offset 126 | 127 | parse_entities(el, True) 128 | entity.length = len(full_text) - entity.offset 129 | 130 | # Fixing entities that starts or ends with whitespace. 131 | if FIX_ISSUES: 132 | s = full_text[entity.offset : entity.offset + entity.length] 133 | diff = entity.length - len(s.lstrip()) 134 | rdiff = entity.length - len(s.rstrip()) 135 | entity.offset += diff 136 | entity.length -= (diff + rdiff) 137 | 138 | if DISALLOW_EMPTY_ENTITIES: 139 | if entity.length < 1: 140 | allow_entity = False 141 | 142 | if allow_entity: 143 | entities.append(entity) 144 | else: 145 | parse_entities(el, True) 146 | else: 147 | full_text += el.text 148 | 149 | parse_entities(element) 150 | 151 | if FIX_ISSUES: 152 | stop = len(entities) 153 | for i in range(0, stop): 154 | ent = entities[i] 155 | 156 | # Cleaning entities inside emojis. 157 | if isinstance(ent, TgMessageEntityEmoji): 158 | for n in range(0, stop): 159 | e = entities[n] 160 | if (e is ent) or (e is None): 161 | continue 162 | if ent.same_place(e): 163 | if isinstance(e, TgMessageEntityEmoji): 164 | if ent.emoji.custom: 165 | entities[n] = None 166 | else: 167 | entities[i] = None 168 | break 169 | elif isinstance(e, (TgMessageEntityBold, TgMessageEntityItalic)): 170 | entities[n] = None 171 | 172 | # cleaning null objects. 173 | tmp_entities = [] 174 | for ent in entities: 175 | if ent is not None: 176 | tmp_entities.append(ent) 177 | entities = tmp_entities 178 | 179 | return full_text, entities 180 | 181 | def parse_channel_info(page: BeautifulSoup) -> TgChannelInfo: 182 | res = TgChannelInfo() 183 | tgme_page = page.find(class_='tgme_page') 184 | # Avatar 185 | photo = tgme_page.find(class_='tgme_page_photo') 186 | res.avatar = photo.find('img')['src'] 187 | # Telegram username 188 | res.name = photo.find('a')['href'] 189 | eq_sign = res.name.rfind('=') 190 | res.name = res.name[eq_sign+1:] 191 | # Url 192 | res.url = f'{TELEGRAM_WEB_URL}/{res.name}' 193 | # Display name 194 | res.display_name = tgme_page.find(class_='tgme_page_title').find('span').get_text() 195 | # Subscribers count 196 | extra = tgme_page.find(class_='tgme_page_extra') 197 | if not extra is None: 198 | extra = extra.get_text() 199 | s_pos = extra.find(' s') 200 | res.subscribers = extra[:s_pos] 201 | # Description 202 | desc = tgme_page.find(class_='tgme_page_description') 203 | if not desc is None: 204 | res.description = desc.get_text() 205 | preview_btn = tgme_page.find(class_='tgme_page_context_link') 206 | res.has_preview = (not preview_btn is None) 207 | return res 208 | 209 | def parse_right_column_channel_info(page: BeautifulSoup) -> TgChannelInfo: 210 | res = TgChannelInfo() 211 | res.has_preview = True 212 | tgme_channel_info = page.find(class_="tgme_channel_info") 213 | header = tgme_channel_info.find(class_='tgme_channel_info_header') 214 | # Avatar 215 | photo = header.find(class_='tgme_page_photo_image') 216 | res.avatar = photo.find('img')['src'] 217 | title = header.find(class_='tgme_channel_info_header_title') 218 | # Display name 219 | res.display_name = header.find(class_='tgme_channel_info_header_title').find('span').get_text() 220 | # Url 221 | res.url = header.find(class_='tgme_channel_info_header_username').find('a')['href'] 222 | # Telegram username 223 | res.name = channel_name_from_url(res.url) 224 | # All counters (subscribers, photos, videos, links) 225 | counters = tgme_channel_info.find(class_='tgme_channel_info_counters') 226 | counters = counters.find_all(class_='tgme_channel_info_counter') 227 | for counter in counters: 228 | value = counter.find(class_='counter_value').get_text() 229 | name = counter.find(class_='counter_type').get_text() 230 | setattr(res, name, value) 231 | # Description 232 | desc = tgme_channel_info.find(class_='tgme_channel_info_description') 233 | if not desc is None: 234 | res.description = desc.get_text() 235 | return res 236 | 237 | def parse_post_from_node(p: BeautifulSoup) -> TgPost: 238 | new_post = TgPost() 239 | tgme_widget_message = p.find(class_="tgme_widget_message", recursive=False) 240 | new_post.url = f"{TELEGRAM_WEB_URL}/{tgme_widget_message['data-post']}" 241 | u = new_post.url 242 | new_post.id = int(u[u.rfind('/')+1:]) 243 | 244 | # Author 245 | tgme_widget_message_user = p.find(class_="tgme_widget_message_user") 246 | tgme_widget_message_user_photo = tgme_widget_message_user.find(class_="tgme_widget_message_user_photo") 247 | try: 248 | # Sometimes this url does not exist in the web page. 249 | new_post.author.url = str(tgme_widget_message_user.find("a")["href"]) 250 | except: 251 | pass 252 | 253 | new_post.author.avatar = str(tgme_widget_message_user_photo.find("img")["src"]) 254 | new_post.author.name = channel_name_from_url(new_post.author.url) 255 | 256 | # Author display_name 257 | tgme_widget_message_owner_name = p.find(class_="tgme_widget_message_owner_name") 258 | if not (tgme_widget_message_owner_name is None): 259 | try: 260 | span = tgme_widget_message_owner_name.find('span') 261 | new_post.author.display_name = span.get_text() 262 | except: 263 | pass 264 | 265 | # Text content 266 | tgme_widget_message_text = p.find_all(class_="tgme_widget_message_text") 267 | if len(tgme_widget_message_text) > 0: 268 | if len(tgme_widget_message_text) > 1: 269 | message_text_elem = tgme_widget_message_text[1] 270 | else: 271 | message_text_elem = tgme_widget_message_text[0] 272 | new_post.content, new_post.entities = parse_text_with_entities(message_text_elem) 273 | 274 | # Service message 275 | service_message = p.find(class_='service_message') 276 | if service_message is not None: 277 | service_msg = TgServiceMessage() 278 | if new_post.content.startswith('Live stream scheduled'): 279 | service_msg.type = TG_SERVICE_MSG_LIVE_STREAM_SHEDULED 280 | elif new_post.content.startswith('Live stream finished'): 281 | service_msg.type = TG_SERVICE_MSG_LIVE_STREAM_FINISHED 282 | elif new_post.content.startswith(f'{new_post.author.display_name} pinned'): 283 | service_msg.type = TG_SERVICE_MSG_PINNED 284 | elif new_post.content.startswith('Channel photo updated'): 285 | service_msg.type = TG_SERVICE_MSG_CHANNEL_PHOTO_UPDATED 286 | service_photo = p.find(class_='tgme_widget_message_service_photo') 287 | if service_photo is not None: 288 | img = service_photo.find('img') 289 | service_msg.extra = img['src'] if img is not None else '' 290 | elif new_post.content.startswith('Channel photo removed'): 291 | service_msg.type = TG_SERVICE_MSG_CHANNEL_PHOTO_REMOVED 292 | elif new_post.content.startswith('Channel name was changed to'): 293 | service_msg.type = TG_SERVICE_MSG_CHANNEL_RENAMED 294 | elif new_post.content.startswith('Channel created'): 295 | service_msg.type = TG_SERVICE_MSG_CHANNEL_CREATED 296 | else: 297 | service_msg.type = TG_SERVICE_MSG_UNKNOWN 298 | 299 | if service_msg.type in (TG_SERVICE_MSG_CHANNEL_RENAMED, TG_SERVICE_MSG_PINNED): 300 | strong_text = service_message.find(class_='tgme_widget_service_strong_text') 301 | if strong_text is not None: 302 | service_msg.extra = strong_text.text 303 | elif service_msg.type in (TG_SERVICE_MSG_LIVE_STREAM_FINISHED, TG_SERVICE_MSG_LIVE_STREAM_SHEDULED): 304 | try: 305 | service_msg.extra = re.search("\((.*?)\)", new_post.content).group(1) 306 | except: 307 | pass 308 | 309 | new_post.service_msg = service_msg 310 | 311 | # Reply info 312 | tgme_widget_message_reply = p.find(class_="tgme_widget_message_reply") 313 | if not tgme_widget_message_reply is None: 314 | new_post.reply = TgPostReply() 315 | new_post.reply.url = tgme_widget_message_reply['href'] 316 | new_post.reply.author_name = tgme_widget_message_reply.find(class_="tgme_widget_message_author_name").get_text() 317 | try: 318 | tgme_widget_message_metatext = tgme_widget_message_reply.find(class_="tgme_widget_message_metatext") 319 | if not (tgme_widget_message_metatext is None): 320 | new_post.reply.metatext = tgme_widget_message_metatext.get_text() 321 | style = tgme_widget_message_reply.find(class_="tgme_widget_message_reply_thumb")['style'] 322 | new_post.reply.image_url = parse_bg_image_url(style) 323 | except: 324 | pass 325 | 326 | # Forwarded from 327 | try: 328 | tgme_widget_message_forwarded_from_name = p.find(class_="tgme_widget_message_forwarded_from_name") 329 | if not tgme_widget_message_forwarded_from_name is None: 330 | new_post.forwarded_from = TgChannel() 331 | new_post.forwarded_from.name = tgme_widget_message_forwarded_from_name.find('span').get_text() 332 | new_post.forwarded_from.url = tgme_widget_message_forwarded_from_name['href'] 333 | except: 334 | pass 335 | 336 | # Rounded video 337 | rounded_vid = p.find(class_="tgme_widget_message_roundvideo_player") 338 | if not (rounded_vid is None): 339 | new_post.rounded_video = TgPostRoundedVideo() 340 | thumb = rounded_vid.find(class_='tgme_widget_message_roundvideo_thumb') 341 | new_post.rounded_video.thumbnail = parse_bg_image_url(thumb['style']) 342 | vid = rounded_vid.find(class_='tgme_widget_message_roundvideo') 343 | new_post.rounded_video.url = vid['src'] 344 | duration = rounded_vid.find(class_='tgme_widget_message_roundvideo_duration') 345 | new_post.rounded_video.duration = duration.get_text() 346 | 347 | # Voice 348 | voice_player = p.find(class_='tgme_widget_message_voice_player') 349 | if not (voice_player is None): 350 | voice = voice_player.find(class_="tgme_widget_message_voice") 351 | if not (voice is None): 352 | new_post.voice = TgPostVoice() 353 | new_post.voice.url = voice['src'] 354 | try: 355 | duration = voice_player.find(class_="tgme_widget_message_voice_duration") 356 | new_post.voice.duration = duration.get_text() 357 | new_post.voice.data_waveform = voice['data-waveform'] 358 | new_post.voice.data_ogg = voice['data-ogg'] 359 | except: 360 | pass 361 | 362 | # Images 363 | images = p.find_all(class_="tgme_widget_message_photo_wrap") 364 | for image in images: 365 | new_image = TgPostImage() 366 | style = image["style"] 367 | new_image.url = parse_bg_image_url(style) 368 | new_image.url_single = image["href"] 369 | new_post.images.append(new_image) 370 | 371 | # Supported videos 372 | videos = p.find_all(class_="tgme_widget_message_video_player") 373 | for vid in videos: 374 | new_video = TgPostVideo() 375 | style = '' 376 | thumb = vid.find(class_="tgme_widget_message_video_thumb") 377 | if thumb is not None and 'style' in thumb.attrs: 378 | style = thumb['style'] 379 | 380 | try: 381 | new_video.image_url = parse_bg_image_url(style) 382 | except: 383 | pass 384 | 385 | try: 386 | new_video.url = vid.find(class_="tgme_widget_message_video")['src'] 387 | except: 388 | pass 389 | 390 | new_video.url_single = vid['href'] 391 | new_post.videos.append(new_video) 392 | 393 | # Link previews 394 | link_previews = p.find_all(class_="tgme_widget_message_link_preview") 395 | for prev in link_previews: 396 | new_prev = TgPostLinkPreview() 397 | new_prev.url = prev["href"] 398 | 399 | try: 400 | thumb = prev.find(class_="link_preview_image") 401 | if thumb is None: 402 | thumb = prev.find(class_="link_preview_right_image") 403 | if not thumb is None: 404 | style = thumb['style'] 405 | new_prev.image_url = parse_bg_image_url(style) 406 | 407 | new_prev.title = prev.find(class_="link_preview_title").get_text() 408 | new_prev.description = prev.find(class_="link_preview_description").get_text() 409 | except: 410 | pass 411 | 412 | link_preview_site_name = prev.find(class_='link_preview_site_name') 413 | if link_preview_site_name is not None: 414 | new_prev.site_name = link_preview_site_name.get_text() 415 | new_post.link_previews.append(new_prev) 416 | 417 | # Documents 418 | docs = p.find_all(class_='tgme_widget_message_document_wrap', recursive=True) 419 | for doc in docs: 420 | new_doc = TgDocument() 421 | tmp_obj = doc.find(class_='audio') 422 | if tmp_obj is not None: 423 | new_doc.type = TG_DOCUMENT_AUDIO 424 | else: 425 | new_doc.type = TG_DOCUMENT_UNKNOWN 426 | 427 | new_doc.url = doc['href'] if 'href' in doc.attrs else '' 428 | 429 | title = doc.find(class_='tgme_widget_message_document_title') 430 | new_doc.title = title.text if title is not None else '' 431 | 432 | extra = doc.find(class_='tgme_widget_message_document_extra') 433 | new_doc.extra = extra.text if extra is not None else '' 434 | 435 | new_post.documents.append(new_doc) 436 | 437 | # Views 438 | tgme_widget_message_views = p.find(class_="tgme_widget_message_views") 439 | if not tgme_widget_message_views is None: 440 | new_post.views = str(tgme_widget_message_views.get_text()) 441 | 442 | # Timestamp 443 | tgme_widget_message_date = p.find(class_="tgme_widget_message_date") 444 | time = tgme_widget_message_date.find("time") 445 | new_post.timestamp = datetime.fromisoformat(time["datetime"]) 446 | 447 | # Sticker 448 | tgme_widget_message_sticker_wrap = p.find(class_='tgme_widget_message_sticker_wrap') 449 | if not tgme_widget_message_sticker_wrap is None: 450 | new_post.sticker = TgSticker() 451 | 452 | # static sticker 453 | tgme_widget_message_sticker = p.find(class_='tgme_widget_message_sticker') 454 | if not tgme_widget_message_sticker is None: 455 | if 'data-webp' in tgme_widget_message_sticker.attrs: 456 | new_post.sticker.image_url = tgme_widget_message_sticker['data-webp'] 457 | elif 'style' in tgme_widget_message_sticker.attrs: 458 | new_post.sticker.image_url = parse_bg_image_url(tgme_widget_message_sticker['style']) 459 | 460 | # Animated sticker 461 | tgme_widget_message_videosticker = p.find(class_='tgme_widget_message_videosticker') 462 | if not tgme_widget_message_videosticker is None: 463 | new_post.sticker.animated = True 464 | 465 | js_videosticker_video = tgme_widget_message_videosticker.find(class_='js-videosticker_video') 466 | if js_videosticker_video is not None: 467 | new_post.sticker.video_url = js_videosticker_video['src'] 468 | 469 | webm_sticker_done = js_videosticker_video.find(class_='webm_sticker_done') 470 | if webm_sticker_done is not None: 471 | new_post.sticker.image_url = webm_sticker_done['src'] 472 | 473 | if new_post.sticker.image_url == '': 474 | img = js_videosticker_video.find('img') 475 | if img is not None: 476 | new_post.sticker.image_url = img['src'] 477 | 478 | # Detect unsupported media 479 | message_media_not_supported_wrap = p.find(class_="message_media_not_supported_wrap") 480 | if not (message_media_not_supported_wrap is None): 481 | message_media_not_supported_label = message_media_not_supported_wrap.find(class_="message_media_not_supported_label") 482 | if not (message_media_not_supported_label is None): 483 | not_support_msg = message_media_not_supported_label.get_text() 484 | new_post.has_not_supported = (not_support_msg.find('in your browser') == -1) 485 | else: 486 | new_post.has_not_supported = True 487 | 488 | # Poll 489 | tgme_widget_message_poll = p.find(class_="tgme_widget_message_poll") 490 | if not (tgme_widget_message_poll is None): 491 | try: 492 | new_post.poll = TgPoll() 493 | 494 | # Question 495 | question = tgme_widget_message_poll.find(class_='tgme_widget_message_poll_question') 496 | new_post.poll.question = question.get_text() 497 | 498 | # Poll type 499 | tgme_widget_message_poll_type = tgme_widget_message_poll.find(class_="tgme_widget_message_poll_type") 500 | new_post.poll.type = tgme_widget_message_poll_type.get_text() 501 | 502 | # Options 503 | tgme_widget_message_poll_options = tgme_widget_message_poll.find(class_='tgme_widget_message_poll_options') 504 | options = tgme_widget_message_poll_options.find_all(class_='tgme_widget_message_poll_option') 505 | for opt in options: 506 | new_opt = TgPoll.TgPollOption() 507 | new_opt.value = opt.find(class_="tgme_widget_message_poll_option_value").get_text().strip() 508 | percents = opt.find(class_='tgme_widget_message_poll_option_percent') 509 | if not (percents is None): 510 | percents = percents.get_text() 511 | percents = percents[:len(percents)-1] 512 | new_opt.percents = int(percents) 513 | new_post.poll.options.append(new_opt) 514 | 515 | # Voters count 516 | voters = p.find(class_="tgme_widget_message_voters") 517 | if not (voters is None): 518 | new_post.poll.voters = voters.get_text() 519 | else: 520 | voters = p.find(class_="tgme_widget_message_poll_votes") 521 | voters = voters.get_text() 522 | space_pos = voters.rfind(' ') 523 | if space_pos != -1: 524 | voters = voters[:space_pos-1] 525 | new_post.poll.voters = voters 526 | except: 527 | pass 528 | 529 | # Invoice 530 | tgme_widget_message_invoice = p.find(class_="tgme_widget_message_invoice") 531 | if not (tgme_widget_message_invoice is None): 532 | new_post.invoice = TgPostInvoice() 533 | title = tgme_widget_message_invoice.find(class_='tgme_widget_message_invoice_title') 534 | new_post.invoice.title = title.get_text() 535 | desc = tgme_widget_message_invoice.find(class_='tgme_widget_message_invoice_description') 536 | new_post.invoice.description = desc.get_text() 537 | 538 | return new_post 539 | 540 | def parse_widget_post(page: BeautifulSoup) -> TgPost: 541 | p = page.find(class_="widget_frame_base") 542 | return parse_post_from_node(p) 543 | 544 | def parse_posts(page: BeautifulSoup) -> []: 545 | history = page.find(class_="tgme_channel_history") 546 | p_posts = history.find_all(class_="tgme_widget_message_wrap", recursive=False) 547 | posts = [] 548 | 549 | for p in p_posts: 550 | new_post = parse_post_from_node(p) 551 | posts.append(new_post) 552 | 553 | return posts 554 | 555 | def parse_posts_page(page: BeautifulSoup): 556 | res = TgPostsPage() 557 | res.posts = parse_posts(page) 558 | res.channel = parse_right_column_channel_info(page) 559 | return res 560 | --------------------------------------------------------------------------------