├── accless_tg_scraper
├── __init__.py
├── client.py
├── serialize
│ ├── classes.py
│ └── markdown.py
├── classes.py
└── parser.py
├── .gitignore
├── tests
├── download-samples.sh
├── text-entities.py
├── client-test.py
├── parser-test.py
└── tg_tests.py
├── setup.py
└── README.md
/accless_tg_scraper/__init__.py:
--------------------------------------------------------------------------------
1 | from .client import *
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | /accless_tg_scraper.egg-info
3 | /build
4 | *.html
5 | reinstall.sh
6 | install-e-mode.sh
--------------------------------------------------------------------------------
/tests/download-samples.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | rm -f tg-posts*.html tg-single-post*.html
3 | id=0
# Download one sample page into the current directory.
#   $1 - page URL
#   $2 - URL type: 1 = channel preview page (saved as tg-posts-N.html),
#        anything else = single post (fetched through the embed widget and
#        saved as tg-single-post-N.html)
# Increments the global counter `id`, which numbers the output files.
function add () {
    local url="$1"
    local url_type="$2"
    id=$((id + 1))
    # Quote every expansion so URLs containing '&', '?' or spaces are not
    # split or globbed by the shell.
    if [ "$url_type" = 1 ]; then
        curl "$url" -fo "tg-posts-$id.html"
    else
        curl "$url?embed=1&mode=tme" -fo "tg-single-post-$id.html"
    fi
}
15 |
16 | add "https://t.me/s/evgenii_ponasenkov" 1
17 | add "https://t.me/evgenii_ponasenkov/7561" 2
18 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 | name = 'accless_tg_scraper',
5 | version = '0.2.0',
6 | author = 'Kisspeace',
7 | keywords = 'telegram scraper parser web',
8 | url = 'http://github.com/Kisspeace/accless-tg-scraper',
9 | description = 'Scrap telegram web WITHOUT account or API token',
10 | packages = ['accless_tg_scraper',
11 | 'accless_tg_scraper.serialize'],
12 | install_requires = [
13 | 'aiohttp',
14 | 'bs4'
15 | ]
16 | )
17 |
--------------------------------------------------------------------------------
/tests/text-entities.py:
--------------------------------------------------------------------------------
1 | from accless_tg_scraper.serialize.markdown import *
2 | from accless_tg_scraper.serialize.classes import *
3 | from accless_tg_scraper.classes import *
4 |
5 | text = 'Hello. md has no spoilers support'
6 | entities = [
7 | TgMessageEntityItalic(1, 2),
8 | TgMessageEntityBold(1, 2),
9 | TgMessageEntityUrl(0, 1, 'https://youtu.be/HTMDNZOlUq4'),
10 | TgMessageEntityStrikethrough(3, 3),
11 | TgMessageEntityBold(4, 1),
12 | TgMessageEntitySpoiler(7, 26)
13 | ]
14 |
15 | md = dump_content(text, entities, RULE_SET_MD)
16 | print(md)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### accless-tg-scraper
2 | Scrape posts from Telegram web WITHOUT an account or API token
3 | #### Install
4 | ```shell
5 | python -m pip install "git+https://github.com/Kisspeace/accless-tg-scraper.git#egg=accless-tg-scraper"
6 | ```
7 | #### Simple example
8 | ```python
9 | import asyncio
10 | from accless_tg_scraper import *
11 |
12 | async def main():
13 | telegram = TgScraper()
14 | page = await telegram.get_posts_page('evgenii_ponasenkov')
15 | posts = page.posts
16 |
17 | print(f'got {len(posts)} posts.')
18 | for post in posts:
19 | print(f'{post.url}:{post.content}\n')
20 |
21 | asyncio.run(main())
22 | ```
23 |
--------------------------------------------------------------------------------
/tests/client-test.py:
--------------------------------------------------------------------------------
1 | #!python
2 | import asyncio
3 | from bs4 import BeautifulSoup
4 | from accless_tg_scraper.client import *
5 | from accless_tg_scraper.classes import *
6 | import re
7 | from tg_tests import *
8 | from accless_tg_scraper.serialize.markdown import *
9 |
10 | tg = TgScraper()
11 | last_posts_page = None
12 |
13 | ponasenkov_tg = 'evgenii_ponasenkov'
14 |
async def get_n_print_channel_info(channel: str, *args, **kwargs):
    """Fetch the channel info for ``channel`` and print it, then the delimiter.

    Extra positional/keyword arguments are forwarded to
    ``tg.get_channel_info``.
    """
    info = await tg.get_channel_info(channel, *args, **kwargs)
    print_channel_info(info)
    print(DELIM)
19 |
async def get_n_print(channel: str, *args, **kwargs):
    """Fetch one posts page of ``channel``, remember it and print it.

    The fetched page is stored in the module-level ``last_posts_page`` so the
    caller can paginate from it. Extra arguments are forwarded to
    ``tg.get_posts_page``.
    """
    global last_posts_page
    # ``channel`` is passed positionally: with ``channel=channel, *args`` the
    # first extra positional argument would also bind to ``channel`` and raise
    # "got multiple values for argument 'channel'".
    posts_page = await tg.get_posts_page(channel, *args, **kwargs)
    last_posts_page = posts_page
    print_channel_info(posts_page.channel)
    print_posts(posts_page)
26 |
async def main():
    """Run the live smoke test: channel info, two posts pages, one post."""
    await get_n_print_channel_info(ponasenkov_tg)
    await get_n_print(ponasenkov_tg)

    # Request the page preceding the first post of the page fetched above.
    # Skip it when that page came back empty instead of failing on posts[0].
    if last_posts_page is not None and last_posts_page.posts:
        await get_n_print(ponasenkov_tg, before=last_posts_page.posts[0].id)
    else:
        print('no posts on the first page, skipping the "before" request.')

    post = await tg.get_post(ponasenkov_tg, 7561)
    print_post(post)
    print(DELIM)
34 | if __name__ == '__main__':
35 | asyncio.run(main())
--------------------------------------------------------------------------------
/tests/parser-test.py:
--------------------------------------------------------------------------------
1 | #!python
2 | from bs4 import BeautifulSoup
3 | from accless_tg_scraper.parser import *
4 | import re
5 | from tg_tests import *
6 | from accless_tg_scraper.serialize.markdown import *
7 | import os
8 | import fnmatch
9 |
def bs_from_file(filename: str) -> BeautifulSoup:
    """Parse the HTML file ``filename`` with the ``html.parser`` backend.

    The file is closed as soon as the document has been parsed.
    """
    with open(filename) as fp:
        return BeautifulSoup(fp, 'html.parser')
14 |
def test_url_parse(url: str):
    """Print the channel name extracted from ``url`` next to the url."""
    channel_name = channel_name_from_url(url)
    print(f"{channel_name} from {url}")
17 |
def test_post_id_parse(url: str):
    """Print the post id extracted from ``url`` next to the url."""
    post_id = post_id_from_url(url)
    print(f"{post_id} from {url}")
20 |
21 | test_url_parse('https://t.me/s/channel_name')
22 | test_url_parse('https://t.me/s/channel_name?after=1030')
23 | test_url_parse('https://t.me/channel_name/752')
24 | test_url_parse('https://t.me/channel_name?someparams=sgduh23847tgdhs')
25 | test_url_parse('https://t.me/channel_name')
26 | test_url_parse('channel_name')
27 |
28 | test_post_id_parse('https://t.me/channel_name/1812')
29 | test_post_id_parse('https://t.me/channel_name/1488?embed=1&mode=tme')
30 |
31 | posts = []
32 |
def add_post(filename: str) -> TgPost:
    """Parse a saved single-post widget page, print it and collect it.

    Returns:
        TgPost: the parsed post (also appended to the global ``posts``).
    """
    parsed = parse_widget_post(bs_from_file(filename))
    print_post(parsed)
    # Appending mutates the module-level list in place.
    posts.append(parsed)
    return parsed
40 |
def add_posts(filename: str) -> list[TgPost]:
    """Parse a saved channel preview page, print its posts and collect them.

    Returns:
        list[TgPost]: the posts parsed from ``filename`` (also appended to
        the global ``posts``).
    """
    new_posts = parse_posts(bs_from_file(filename))
    print_posts(new_posts)
    posts.extend(new_posts)
    # The signature promises the parsed posts; previously nothing was returned.
    return new_posts
47 |
48 | for f in os.listdir(os.curdir):
49 | if fnmatch.fnmatch(f, 'tg-single-post*'):
50 | add_post(f)
51 | elif fnmatch.fnmatch(f, 'tg-posts*'):
52 | add_posts(f)
53 |
54 | # new = []
55 | # for p in posts:
56 | # if p.has_service_msg():
57 | # new.append(p)
58 | # print(f'service msg_ {p.service_msg.type} : {p.service_msg.extra}')
59 | # posts = new
60 |
61 | dump_posts(posts, 'dump.md', 'a')
62 |
--------------------------------------------------------------------------------
/tests/tg_tests.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from bs4 import BeautifulSoup
3 | from accless_tg_scraper.parser import *
4 | from accless_tg_scraper.client import *
5 | from accless_tg_scraper.classes import *
6 | import re
7 |
8 | DELIM = ''
9 |
def print_channel_info(c: TgChannelInfo):
    """Print a short human-readable summary of channel ``c``."""
    lines = [
        f'{c.display_name} - {c.name} - {c.subscribers} subs, {c.photos} photos, {c.videos} videos, {c.links} links. {c.url}',
        f'avatar: {c.avatar}',
        f'desc: {c.description}',
    ]
    if c.has_preview:
        lines.append('channel has preview page.')
    for line in lines:
        print(line)
16 |
def print_post(post: TgPost):
    """Print every populated part of ``post`` in a human-readable form.

    All lines are built with f-strings, so a field that is ``None`` or not a
    ``str`` is printed as-is instead of aborting with a ``TypeError`` the way
    ``'...' + value`` concatenation does.
    """
    print(f'({post.author.name}) {post.author.url} on {post.url} at {post.timestamp} with {post.views} views.')

    if post.has_forward():
        print(f'📰 forwarded from: {post.forwarded_from.name} : {post.forwarded_from.url}')

    if post.has_reply():
        print(f'✉️ reply: {post.reply.author_name} : {post.reply.url} : {post.reply.image_url}')
        print(f'✉️ reply metatext: {post.reply.metatext}')

    if post.content != "":
        print(f'Text: {post.content}')

    if post.has_sticker():
        if not post.sticker.animated:
            print(f'🗿 Sticker: {post.sticker.image_url}')
        else:
            print(f'🗿 Animated sticker: {post.sticker.video_url} {post.sticker.image_url}')

    # ``has_not_supported`` is a plain bool attribute, not a method.
    if post.has_not_supported:
        print('⚠️ Post has not supported media !')

    if post.has_voice():
        print(f'🔊 {post.voice.duration} -> {post.voice.url}')

    if post.has_rounded_video():
        print(f'📹 {post.rounded_video.duration} -> {post.rounded_video.url}\nthumb: {post.rounded_video.thumbnail}')

    if post.has_images():
        for img in post.images:
            print(f'🌉 image: {img.url} : {img.url_single}')

    if post.has_videos():
        for vid in post.videos:
            print(f'🎥 video: {vid.url} : {vid.image_url} : {vid.url_single}')

    if post.has_link_previews():
        for link in post.link_previews:
            print(f'🔗 link ({link.site_name}): {link.url} - {link.title} - {link.description}')
            print(f'🔗 link thumbnail: {link.image_url}')

    if post.has_poll():
        print(f"❔: {post.poll.question} with {post.poll.voters} voters:")
        for i, opt in enumerate(post.poll.options, start=1):
            print(f"{i} ) [{opt.percents}%]: {opt.value}")

    if post.has_invoice():
        print(f"💳: {post.invoice.title}: {post.invoice.description}")
67 |
def print_posts(posts: any):
    """Print the number of posts and then every post followed by ``DELIM``.

    Args:
        posts: either a ``TgPostsPage`` (its ``posts`` list is used) or a
            plain sequence of posts.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of TgPostsPage.
    if isinstance(posts, TgPostsPage):
        posts = posts.posts
    print(f'Count: {len(posts)}')
    for post in posts:
        print_post(post)
        print(DELIM)
--------------------------------------------------------------------------------
/accless_tg_scraper/client.py:
--------------------------------------------------------------------------------
1 | import asyncio, aiohttp
2 | import copy
3 | from bs4 import BeautifulSoup
4 | from accless_tg_scraper.classes import *
5 | from accless_tg_scraper.parser import *
6 |
class TgScraper():
    """Asynchronous client for the public (account-less) Telegram web pages.

    Every request opens a short-lived ``aiohttp.ClientSession`` configured
    with the scraper's headers and timeouts, downloads one page and hands the
    HTML to the matching ``parse_*`` function.
    """

    def __init__(self):
        self.base_url: str = TELEGRAM_WEB_URL
        self.timeouts = aiohttp.ClientTimeout(connect=0.6)
        self._headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            # 'br' is intentionally not advertised: aiohttp can only decode
            # brotli when the optional Brotli package is installed, and this
            # project does not depend on it. A brotli-encoded reply would
            # make reading the body fail.
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.5',
            'TE': 'trailers',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:107.0) Gecko/20100101 Firefox/107.0'
        }

    def _url_preview(self, channel_name: str) -> str:
        """Return the url of the channel preview page (``/s/<channel>``)."""
        return f"{self.base_url}/s/{channel_name}"

    def _url_post_widget(self, channel: str, post_id: int) -> str:
        """Return the url of the embeddable widget for a single post."""
        return f"{self.base_url}/{channel}/{post_id}?embed=1&mode=tme"

    def _bs(self, response) -> BeautifulSoup:
        """Parse an HTML document with the ``html.parser`` backend."""
        return BeautifulSoup(response, 'html.parser')

    def _new_session(self, *args, **kwargs) -> aiohttp.ClientSession:
        """Create a session with the scraper's headers and timeouts.

        Extra arguments are accepted for backward compatibility and ignored.
        """
        return aiohttp.ClientSession(headers=self._headers, timeout=self.timeouts)

    async def _fetch_text(self, url: str, params: dict = None) -> str:
        """Download ``url`` and return the response body as text.

        Both the session and the response are used as context managers so
        the connection is released even when reading the body fails.
        """
        async with self._new_session() as session:
            async with session.get(url=url, params=params) as resp:
                return await resp.text()

    # Setters & getters:

    def set_headers(self, headers: dict):
        """Replace the request headers with a deep copy of ``headers``."""
        self._headers = copy.deepcopy(headers)

    def get_headers(self) -> dict:
        """Return a deep copy of the request headers."""
        return copy.deepcopy(self._headers)

    # functions

    async def get_post(self, channel: str, post_id: int) -> TgPost:
        """Fetch and parse a single post through its embed widget."""
        text = await self._fetch_text(self._url_post_widget(channel, post_id))
        return parse_widget_post(self._bs(text))

    async def get_posts_page(self, channel: str, q: str = '', before = '', after = '', full_url: str = '') -> TgPostsPage:
        """Fetch and parse one page of posts from a channel preview.

        Args:
            channel (str): channel name; ignored when ``full_url`` is given.
            q (str): search query.
            before: passed as the ``before`` query parameter.
            after: passed as the ``after`` query parameter.
            full_url (str): complete url to request instead of building one
                from ``channel`` and the filters.

        Returns:
            TgPostsPage: parsed posts together with the channel info.
        """
        params = None
        if full_url == '':
            url = self._url_preview(channel)
            # Only send the filters that were actually supplied instead of
            # appending empty "q=&before=&after=" parameters to every request.
            candidates = {'q': str(q), 'before': str(before), 'after': str(after)}
            params = {key: value for key, value in candidates.items() if value != ''} or None
        else:
            url = full_url

        text = await self._fetch_text(url, params)
        return parse_posts_page(self._bs(text))

    async def get_channel_info(self, channel: str) -> TgChannelInfo:
        """Fetch and parse the channel landing page (``/<channel>``)."""
        text = await self._fetch_text(f'{self.base_url}/{channel}')
        return parse_channel_info(self._bs(text))
--------------------------------------------------------------------------------
/accless_tg_scraper/serialize/classes.py:
--------------------------------------------------------------------------------
1 | from accless_tg_scraper.classes import *
2 | from copy import deepcopy
3 |
class TgEntityRuleSet():
    """Set of per-entity-type rules used to render message entities as text.

    The base class leaves every rule unset (``None``); concrete formats fill
    them in. Note that the strikethrough rule lives in the attribute spelled
    ``strikethrogh`` - the name is kept because subclasses assign to it.
    """

    class EntityRule():
        """Wraps the text covered by an entity in ``prefix`` / ``postfix``."""

        def __init__(self, ent_type: 'type[TgMessageEntity]', prefix: str, postfix: str):
            self.prefix: str = prefix
            self.postfix: str = postfix
            # Entity class (not instance) this rule applies to.
            self.type = ent_type

        def convert(self, entity: 'TgMessageEntity', source: str):
            """Return the entity's slice of ``source`` wrapped in the markup."""
            sub_str = source[entity.offset : entity.offset + entity.length]
            return f'{self.prefix}{sub_str}{self.postfix}'

    def __init__(self):
        # Each attribute holds an EntityRule or None when not configured.
        self.bold = None
        self.italic = None
        self.strikethrogh = None
        self.underlined = None
        self.url = None
        self.spoiler = None
        self.emoji = None

    def get_rules(self) -> list:
        """Return all rule slots in a fixed order (unset slots are None)."""
        return [
            self.bold,
            self.italic,
            self.strikethrogh,
            self.underlined,
            self.url,
            self.spoiler,
            self.emoji
        ]

    def rule_by_type(self, ent_type: 'type[TgMessageEntity]'):
        """Return the rule registered for exactly ``ent_type``, else None."""
        for rule in self.get_rules():
            # Unset slots are None and must be skipped, otherwise a rule set
            # that is only partially configured fails on ``None.type``.
            if rule is not None and rule.type is ent_type:
                return rule
        return None

    def get_converted(self, entity: 'TgMessageEntity', source: str):
        """Render ``entity`` with its rule.

        When no rule is registered for the entity's type, the covered text is
        returned unchanged.
        """
        rule = self.rule_by_type(type(entity))
        if rule is None:
            return source[entity.offset : entity.offset + entity.length]
        return rule.convert(entity, source)
44 |
def dump_content(content: str, entities: list, rule_set: 'TgEntityRuleSet') -> str:
    """Render ``content`` with the markup described by ``entities``.

    Entities are applied in list order. After each one is rendered, the
    offsets and lengths of the remaining entities are remapped so they keep
    covering the same text in the growing result. Entities whose type has no
    rule in ``rule_set`` are left as plain text.

    Args:
        content (str): text.
        entities (list): list of entities for given text.
        rule_set (TgEntityRuleSet): rule set.
    Returns:
        str: post.content converted to custom format with post.entities.
    """
    if len(entities) < 1:
        return content

    # Offsets and lengths are rewritten below; never touch the caller's objects.
    entities = deepcopy(entities)
    res: str = content

    for i, ent in enumerate(entities):
        rule = rule_set.rule_by_type(type(ent))
        if rule is None:
            continue

        converted = rule.convert(ent, res)
        start = ent.offset
        end = start + ent.length
        res = res[:start] + converted + res[end:]

        grow = len(converted) - ent.length  # characters added in total
        head = len(rule.prefix)             # characters added before the text

        # Remap all following entities.
        for nxt in entities[i + 1:]:
            nxt_end = nxt.offset + nxt.length
            if nxt.offset >= end:
                # Entirely behind the rendered entity.
                nxt.offset += grow
            elif nxt.offset >= start:
                if nxt_end <= end:
                    # Nested inside (or same span): skip the inserted prefix.
                    nxt.offset += head
                elif nxt.offset == start:
                    # Starts together but reaches further: it encloses the
                    # rendered entity and must cover all of its markup.
                    nxt.length += grow
                else:
                    # Starts inside and reaches past the end.
                    nxt.offset += head
                    nxt.length += grow - head
            elif nxt_end >= end:
                # Starts earlier and encloses the rendered entity.
                nxt.length += grow
            elif nxt_end > start:
                # Starts earlier and ends inside the rendered entity.
                nxt.length += head

    return res
88 |
--------------------------------------------------------------------------------
/accless_tg_scraper/serialize/markdown.py:
--------------------------------------------------------------------------------
1 | from accless_tg_scraper.classes import *
2 | from accless_tg_scraper.serialize.classes import *
3 | from copy import deepcopy
4 |
class TgEntityRuleSetMarkdown(TgEntityRuleSet):
    """Rule set that renders message entities as Markdown.

    Underlined text, spoilers and emoji are registered with empty
    prefix/postfix, so they are emitted as plain text.
    """

    class EntityRuleUrl(TgEntityRuleSet.EntityRule):
        """Renders a url entity as ``[text](url)``."""

        def __init__(self, ent_type=TgMessageEntityUrl, prefix='[', postfix=']'):
            # Forward the arguments; previously ``ent_type`` was silently
            # replaced by the default.
            super().__init__(ent_type=ent_type, prefix=prefix, postfix=postfix)
            self.url_prefix: str = '('
            self.url_postfix: str = ')'

        def convert(self, entity: TgMessageEntity, source: str):
            """Return the covered text followed by the entity's url."""
            sub_str = source[entity.offset : entity.offset + entity.length]
            return f'{self.prefix}{sub_str}{self.postfix}{self.url_prefix}{entity.url}{self.url_postfix}'

    def __init__(self):
        super().__init__()
        self.bold = self.EntityRule(TgMessageEntityBold, '**', '**')
        self.italic = self.EntityRule(TgMessageEntityItalic, '*', '*')
        self.strikethrogh = self.EntityRule(TgMessageEntityStrikethrough, '~~', '~~')
        self.underlined = self.EntityRule(TgMessageEntityUnderlined, '', '')
        self.url = self.EntityRuleUrl()
        self.spoiler = self.EntityRule(TgMessageEntitySpoiler, '', '')
        self.emoji = self.EntityRule(TgMessageEntityEmoji, '', '')
28 |
29 | # global entity rule set for markdown.
30 | RULE_SET_MD = TgEntityRuleSetMarkdown()
31 |
def dump_posts(posts: 'list[TgPost]', file: any, fmode: str = 'a') -> None:
    """Write ``posts`` as Markdown.

    Args:
        posts: posts to serialize.
        file: a path (``str``), which is opened here with mode ``fmode`` as
            UTF-8 and closed again, or an already opened text stream, which
            is written to and left open.
        fmode: mode used when ``file`` is a path.
    """
    opened_localy = isinstance(file, str)
    if opened_localy:
        # The output contains emoji, so do not rely on the platform encoding.
        file = open(file, mode=fmode, encoding='utf-8')

    out = file.write

    try:
        for post in posts:
            views_str = f'with {post.views} views.' if post.views != '' else ''
            out(f'### [{post.author.display_name}]({post.author.url}): [post]({post.url}) at {post.timestamp} {views_str} \n')

            if post.has_forward():
                out(f'**📰 forwarded from**: [{post.forwarded_from.name}]({post.forwarded_from.url}) \n')

            if post.has_reply():
                out(f'**✉️ reply**: [{post.reply.author_name}]({post.reply.url}) \n')
                out(f'**✉️ reply metatext**: {post.reply.metatext} \n')

            if post.content != '':
                content = dump_content(post.content, post.entities, RULE_SET_MD)
                content = content.replace('\n', ' \n')
                out(f'{content} \n')

            if post.has_sticker():
                if not post.sticker.animated:
                    out(f'[🗿 Sticker]({post.sticker.image_url}) \n')
                else:
                    out(f'[🗿 Sticker]({post.sticker.video_url}) [thumb]({post.sticker.image_url}) \n')

            # ``has_not_supported`` is a plain bool attribute, not a method.
            if post.has_not_supported:
                out('~~⚠️ Post has not supported media !~~ \n')

            if post.has_voice():
                out(f'[🔊 {post.voice.duration}]({post.voice.url}) \n')

            if post.has_rounded_video():
                out(f'[📹 {post.rounded_video.duration}]({post.rounded_video.url}) \n  \n')

            if post.has_documents():
                for doc in post.documents:
                    emoji = '🎵' if doc.type == TG_DOCUMENT_AUDIO else '💾'
                    extra = f' - {doc.extra}' if doc.extra != '' else ''
                    out(f'[{emoji} file]({doc.url}): **{doc.title}**{extra} \n')

            if post.has_images():
                for img in post.images:
                    # Emit the image itself; previously only a blank line was
                    # written and the url never reached the output.
                    out(f'![🌉 image]({img.url}) \n')

            if post.has_videos():
                for vid in post.videos:
                    if vid.url:
                        out(f'[🎥 video]({vid.url}) \n')

            if post.has_link_previews():
                for link in post.link_previews:
                    out(f'[🔗 link ({link.site_name})]({link.url}): {link.title} - {link.description} \n')
                    out(f'[🔗 link thumbnail]({link.image_url}) \n')

            if post.has_poll():
                out(f'**❔ poll**: {post.poll.question} with {post.poll.voters} voters: \n')
                for i, opt in enumerate(post.poll.options, start=1):
                    out(f'{i} ) [{opt.percents}%]: {opt.value} \n')

            if post.has_invoice():
                out(f'**💳 invoice**: {post.invoice.title}: {post.invoice.description} \n')

            out('\n')
    finally:
        # Close the file we opened even when a post fails to serialize.
        if opened_localy:
            file.close()
107 |
--------------------------------------------------------------------------------
/accless_tg_scraper/classes.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | # Consts
4 | TG_SERVICE_MSG_UNKNOWN = -1
5 | TG_SERVICE_MSG_CHANNEL_CREATED = 0
6 | TG_SERVICE_MSG_CHANNEL_RENAMED = 1
7 | TG_SERVICE_MSG_CHANNEL_PHOTO_REMOVED = 2
8 | TG_SERVICE_MSG_CHANNEL_PHOTO_UPDATED = 3
9 | TG_SERVICE_MSG_LIVE_STREAM_FINISHED = 4
10 | TG_SERVICE_MSG_LIVE_STREAM_SHEDULED = 5
11 | TG_SERVICE_MSG_PINNED = 6
12 |
13 | # Document types:
14 | TG_DOCUMENT_UNKNOWN = -1
15 | TG_DOCUMENT_AUDIO = 0
16 |
class TgChannel:
    """Basic identity of a channel.

    Attributes (all ``str``, empty by default): url, avatar, name,
    display_name.
    """

    def __init__(self):
        self.url = self.avatar = self.name = self.display_name = ''


class TgChannelInfo(TgChannel):
    """Channel identity plus the counters shown on its web page.

    Attributes:
        subscribers, photos, videos, links (str): counters as displayed,
            e.g. '73.2 k'.
        description (str): channel description.
        has_preview (bool | None): can be parsed from links like
            'https://t.me/channel_name'; ``None`` until set.
    """

    def __init__(self):
        super().__init__()
        self.subscribers = self.photos = self.videos = self.links = ''
        self.description = ''
        self.has_preview = None
33 |
class TgPostVoice:
    """Voice message attached to a post.

    Attributes (all ``str``): url, data_waveform, data_ogg (sometimes empty),
    duration (like '0:25').
    """

    def __init__(self):
        self.url = self.data_waveform = self.data_ogg = self.duration = ''
40 |
class TgPostRoundedVideo:
    """Rounded video message attached to a post.

    Attributes (all ``str``): url, thumbnail, duration.
    """

    def __init__(self):
        self.url = self.thumbnail = self.duration = ''
46 |
class TgPostImage:
    """Image attached to a post.

    Attributes (``str``): url, url_single.
    """

    def __init__(self):
        self.url = self.url_single = ''


class TgPostInvoice:
    """Invoice attached to a post.

    Attributes (``str``): title, description.
    """

    def __init__(self):
        self.title = self.description = ''


class TgPostVideo(TgPostImage):
    """Video attached to a post; ``image_url`` is its thumbnail."""

    def __init__(self):
        super().__init__()
        self.image_url = ''  # thumbnail
61 |
class TgPostReply:
    """Preview of the message a post replies to.

    Attributes (all ``str``): author_name, url, image_url, metatext.
    """

    def __init__(self):
        self.author_name = self.url = self.image_url = self.metatext = ''
68 |
class TgPostLinkPreview:
    """Link preview card attached to a post.

    Attributes (all ``str``): site_name (like 'YouTube'), url, title,
    description, image_url.
    """

    def __init__(self):
        self.site_name = self.url = self.title = ''
        self.description = self.image_url = ''
76 |
class TgSticker:
    """Sticker attached to a post.

    Attributes:
        animated (bool): False by default.
        image_url (str), video_url (str): empty by default.
    """

    def __init__(self):
        self.animated = False
        self.image_url = self.video_url = ""
82 |
class TgPoll:
    """Poll attached to a post.

    Attributes:
        type (str): like 'Anonymous poll'.
        question (str): poll question.
        options (list): list of ``TgPollOption``.
        voters (str): like '32.3k'.
    """

    class TgPollOption:
        """One answer of a poll: its text and its share in percent."""

        def __init__(self):
            self.value = ''
            self.percents = -1

    def __init__(self):
        self.type = self.question = ''
        self.options = []  # a fresh list for every poll
        self.voters = ''
95 |
class TgEmoji:
    """Telegram emoji.

    Attributes:
        id: Emoji id (-1 until set).
        custom: True if it is a custom emoji.
        animated: True if it is an animated emoji.
        image_url: Original representation of the emoji (also available for
            a custom one).
        custom_image_url: Custom representation of the emoji.
        data: Image svg+xml data as text.
        tgs_url: Link to the .tgs file.
    """

    def __init__(self):
        self.id = -1
        self.custom = self.animated = False
        self.image_url = self.custom_image_url = ''
        self.data = ''
        self.tgs_url = ''
115 |
class TgMessageEntity:
    """Base class for all message entities.
    See: https://core.telegram.org/api/entities

    offset: Offset in string.
    length: Characters count.
    """

    def __init__(self, offset: int, length: int):
        self.offset = offset
        self.length = length

    def same_place(self, entity) -> bool:
        """Return True when both entities have the same offset and length."""
        return (self.offset, self.length) == (entity.offset, entity.length)

    def starts_after(self, entity) -> bool:
        """
        Returns:
            bool: True if current entity starts at or behind the end position
            of given entity.
        """
        end_of_other = entity.offset + entity.length
        return self.offset >= end_of_other

    def starts_inside(self, entity) -> bool:
        """
        Returns:
            bool: True if current entity starts inside given entity.
        """
        if self.starts_after(entity):
            return False
        return self.offset >= entity.offset


class TgMessageEntityUrl(TgMessageEntity):
    """Message entity with text and url behind the text."""

    def __init__(self, offset: int = 0, length: int = 0, url: str = ''):
        super().__init__(offset, length)
        self.url = url


class TgMessageEntityEmoji(TgMessageEntity):
    """Message entity with telegram emoji (``emoji`` is None until set)."""

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)
        self.emoji = None


class TgMessageEntityBold(TgMessageEntity):
    """Message entity with bold text."""

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)


class TgMessageEntityItalic(TgMessageEntity):
    """Message entity with italic text."""

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)


class TgMessageEntityStrikethrough(TgMessageEntity):
    """Message entity with Strikethrough text."""

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)


class TgMessageEntityUnderlined(TgMessageEntity):
    """Message entity with underlined text."""

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)


class TgMessageEntitySpoiler(TgMessageEntity):
    """Message entity with hidden text."""

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)
188 |
class TgServiceMessage:
    """Service message of a channel.

    Attributes:
        type (int): one of the ``TG_SERVICE_MSG_*`` constants.
        extra (str): (url, text) depends on type.
    """

    def __init__(self):
        self.type = TG_SERVICE_MSG_UNKNOWN
        self.extra = ''
193 |
class TgDocument:
    """File attached to a post.

    Attributes:
        type (int): one of the ``TG_DOCUMENT_*`` constants.
        url (str), title (str), extra (str): empty by default.
    """

    def __init__(self):
        self.type = TG_DOCUMENT_UNKNOWN
        self.url = self.title = self.extra = ''
200 |
class TgPost():
    """A single channel post with everything that can be attached to it.

    Optional parts (voice, reply, sticker, poll, ...) are ``None`` when
    absent; list-valued parts are empty lists. Use the ``has_*`` helpers to
    test for them. ``has_not_supported`` is a plain bool attribute.
    """

    def __init__(self):
        self.url: str = ''
        self.id: int = -1
        # self.type: int = TG_MESSAGE
        self.content: str = ''
        self.entities: list[TgMessageEntity] = []
        self.timestamp: datetime = datetime.now()
        self.author: TgChannel = TgChannel()
        self.views: str = '' # like '1.8k'
        self.images: list[TgPostImage] = []
        self.videos: list[TgPostVideo] = []
        self.documents: list[TgDocument] = [] # list of attached files
        self.voice: TgPostVoice = None
        self.rounded_video: TgPostRoundedVideo = None
        self.link_previews: list[TgPostLinkPreview] = []
        self.has_not_supported: bool = False # Media is too big : VIEW IN TELEGRAM
        self.forwarded_from: TgChannel = None
        self.reply: TgPostReply = None
        self.sticker: TgSticker = None
        self.poll: TgPoll = None
        self.invoice: TgPostInvoice = None
        self.service_msg: TgServiceMessage = None

    # Presence of optional parts is tested by identity (``is not None``):
    # an ``!= None`` comparison can be answered wrongly by objects that
    # define their own ``__eq__``.

    def has_service_msg(self) -> bool:
        return self.service_msg is not None

    def has_forward(self) -> bool:
        return self.forwarded_from is not None

    def has_reply(self) -> bool:
        return self.reply is not None

    def has_sticker(self) -> bool:
        return self.sticker is not None

    def has_voice(self) -> bool:
        return self.voice is not None

    def has_rounded_video(self) -> bool:
        return self.rounded_video is not None

    def has_images(self) -> bool:
        return len(self.images) > 0

    def has_videos(self) -> bool:
        return len(self.videos) > 0

    def has_entities(self) -> bool:
        return len(self.entities) > 0

    def has_link_previews(self) -> bool:
        return len(self.link_previews) > 0

    def has_documents(self) -> bool:
        return len(self.documents) > 0

    def has_poll(self) -> bool:
        return self.poll is not None

    def has_invoice(self) -> bool:
        return self.invoice is not None
263 |
class TgPostsPage:
    """One page of a channel preview.

    Attributes:
        posts (list[TgPost]): posts found on the page.
        channel (TgChannelInfo): channel info from right column on web page.
    """

    def __init__(self):
        self.posts = []
        self.channel = TgChannelInfo()
268 |
--------------------------------------------------------------------------------
/accless_tg_scraper/parser.py:
--------------------------------------------------------------------------------
1 | from accless_tg_scraper.classes import *
2 | from bs4 import BeautifulSoup
3 | from datetime import datetime
4 | from typing import Tuple
5 | import re
6 |
TELEGRAM_WEB_URL = 'https://t.me'


def _tg_path_segments(url: str, base_url: str):
    """Return the path segments that follow ``base_url`` inside ``url``.

    The query string and fragment are dropped first, then a leading ``s/``
    (channel preview prefix) is removed. Returns ``None`` when ``url`` does
    not contain ``base_url``.
    """
    base = url.find(base_url)
    if base == -1:
        return None
    path = url[base + len(base_url):]
    # Cut the query/fragment before looking for '/', which may also appear
    # inside parameter values.
    for sep in ('?', '#'):
        path = path.partition(sep)[0]
    if path.startswith('s/'):
        path = path[2:]
    return [segment for segment in path.split('/') if segment]


def channel_name_from_url(url: str, base_url: str = TELEGRAM_WEB_URL+'/') -> str:
    """Extract the channel name from a Telegram web url.

    Handles ``/s/<channel>``, ``/<channel>`` and ``/<channel>/<post>`` urls
    with or without a query string. When ``url`` does not contain
    ``base_url`` it is assumed to already be a channel name and is returned
    unchanged.
    """
    segments = _tg_path_segments(url, base_url)
    if segments is None:
        return url
    return segments[0] if segments else ''


def post_id_from_url(url: str, base_url: str = TELEGRAM_WEB_URL+'/') -> int:
    """Extract the post id from a Telegram post url.

    Returns:
        int: the numeric id taken from the last path segment, or ``None``
        when ``url`` does not contain ``base_url``, has no post segment, or
        that segment is not a number.
    """
    segments = _tg_path_segments(url, base_url)
    # A post url needs at least "<channel>/<id>".
    if segments is None or len(segments) < 2:
        return None
    try:
        return int(segments[-1])
    except ValueError:
        return None
35 |
# Matches "background-image:url('...')" in an inline style. Written as a raw
# string (a plain "\(" is an invalid escape sequence); also accepts double
# quotes and optional whitespace around the url.
_BG_IMAGE_URL_RE = re.compile(r"""background-image:\s*url\(\s*(['"])(.*?)\1\s*\)""")


def parse_bg_image_url(style_str: str) -> str:
    """Return the url of the ``background-image`` declared in ``style_str``.

    Returns:
        str: the url, or '' when the style has no quoted background-image
        url (instead of failing on the missing match).
    """
    match = _BG_IMAGE_URL_RE.search(style_str or '')
    return match.group(2) if match else ''
38 |
def parse_emoji(element: BeautifulSoup) -> TgEmoji:
    """Build a ``TgEmoji`` from an emoji element of a message.

    Reads the ``emoji-id`` attribute, the custom emoji sources inside
    ``.tg-emoji-wrap`` / ``.tg-emoji`` and the background image of the
    ``.emoji`` element. Anything that is missing keeps its default value.
    """
    res = TgEmoji()
    if 'emoji-id' in element.attrs:
        res.id = element['emoji-id'] # Emoji id.

    tg_emoji_wrap = element.find(class_='tg-emoji-wrap')
    if tg_emoji_wrap is not None:

        tg_emoji: BeautifulSoup = tg_emoji_wrap.find(class_='tg-emoji')
        if tg_emoji is not None:
            res.custom = True
            # .get(): the attribute may be absent, which is exactly the case
            # the fallback below is meant for; indexing would raise KeyError.
            res.custom_image_url = tg_emoji.get('data-webp')

            if res.custom_image_url in ('', None):
                # tgs file url.
                source_tgs = tg_emoji.find(attrs={'type': 'application/x-tgsticker'})
                res.tgs_url = source_tgs['srcset'] if source_tgs is not None else ''

                # svg+xml data.
                source_xml = tg_emoji.find(attrs={'type': 'image/svg+xml'})
                res.data = source_xml['srcset'] if source_xml is not None else ''

    # Original emoji image url.
    emoji = element.find(class_='emoji')
    if emoji is not None:
        style = emoji.get('style')
        # Without a style attribute there is no background image to read.
        if style:
            res.image_url = parse_bg_image_url(style)

    return res
67 |
def parse_text_with_entities(element: BeautifulSoup) -> Tuple[str, list[TgMessageEntity]]:
    """Convert a message element into plain text plus its entities.

    The element tree is walked depth-first. Text nodes are appended to the
    result, ``<br>`` becomes a newline, and every tag recognised by
    ``create_entity`` yields an entity spanning the text produced by its
    children. Because children are visited first, nested entities precede
    the entity that encloses them in the returned list.

    Args:
        element (BeautifulSoup): element for parse text from.

    Returns:
        Tuple: first item is a full text, second item is a list of entities.
    """
    FIX_ISSUES = True
    entities = []
    full_text: str = ''

    def create_entity(subject: BeautifulSoup) -> TgMessageEntity:
        """Return a new entity matching the tag, or None for unknown tags."""
        res = None
        l_classes = subject.attrs.get('class', [])

        if (subject.name == 'tg-emoji') or ('emoji' in l_classes): # Telegram emoji.
            res = TgMessageEntityEmoji()
            res.emoji = parse_emoji(subject)
        elif (subject.name == 'b') or ('tgme_widget_service_strong_text' in l_classes): # Bold text.
            res = TgMessageEntityBold()
        elif subject.name == 'i': # Italic text.
            res = TgMessageEntityItalic()
        elif subject.name == 'a': # Hyperlink or user mention.
            res = TgMessageEntityUrl()
            # An anchor without href must not abort the whole parse.
            res.url = subject.get('href', '')
        elif subject.name == 'u': # Underlined text.
            res = TgMessageEntityUnderlined()
        elif subject.name == 's': # Strikethrough text.
            res = TgMessageEntityStrikethrough()
        elif subject.name == 'tg-spoiler':
            res = TgMessageEntitySpoiler()
        return res

    def parse_entities(subject: BeautifulSoup, work_on_br: bool = True):
        """Append the text of ``subject`` and collect its entities."""
        DISALLOW_EMPTY_ENTITIES = True

        nonlocal full_text
        nonlocal entities

        for el in subject:
            if el.name is None:
                # Just a text node.
                full_text += el.text
                continue

            if el.name == 'br':
                # br tag must break line like on the web-page.
                if work_on_br:
                    full_text += '\n'
                continue

            entity: TgMessageEntity = create_entity(el)
            if entity is None:
                # Unknown tag: keep its text, create no entity.
                parse_entities(el, True)
                continue

            allow_entity = True
            entity.offset = len(full_text)
            parse_entities(el, True)
            entity.length = len(full_text) - entity.offset

            # Fixing entities that starts or ends with whitespace.
            if FIX_ISSUES:
                s = full_text[entity.offset : entity.offset + entity.length]
                diff = entity.length - len(s.lstrip())
                rdiff = entity.length - len(s.rstrip())
                entity.offset += diff
                entity.length -= (diff + rdiff)

            if DISALLOW_EMPTY_ENTITIES:
                if entity.length < 1:
                    allow_entity = False

            if allow_entity:
                entities.append(entity)

    parse_entities(element)

    if FIX_ISSUES:
        stop = len(entities)
        for i in range(0, stop):
            ent = entities[i]

            # Cleaning entities inside emojis.
            if isinstance(ent, TgMessageEntityEmoji):
                for n in range(0, stop):
                    e = entities[n]
                    if (e is ent) or (e is None):
                        continue
                    if ent.same_place(e):
                        if isinstance(e, TgMessageEntityEmoji):
                            # Two emoji entities on the same span: the one
                            # seen first wins only if it is custom.
                            if ent.emoji.custom:
                                entities[n] = None
                            else:
                                entities[i] = None
                                break
                        elif isinstance(e, (TgMessageEntityBold, TgMessageEntityItalic)):
                            entities[n] = None

        # cleaning null objects.
        entities = [ent for ent in entities if ent is not None]

    return full_text, entities
180 |
def parse_channel_info(page: BeautifulSoup) -> TgChannelInfo:
    """Build a TgChannelInfo from a page that contains a ``tgme_page`` element.

    Avatar, username and display name are read unconditionally, so a missing
    tag or attribute raises (TypeError / AttributeError / KeyError). The
    subscribers counter and the description are optional.

    Args:
        page (BeautifulSoup): parsed page.

    Returns:
        TgChannelInfo: the populated channel info.
    """
    res = TgChannelInfo()
    tgme_page = page.find(class_='tgme_page')

    # Avatar
    photo = tgme_page.find(class_='tgme_page_photo')
    res.avatar = photo.find('img')['src']

    # Telegram username: everything after the last '=' of the photo link
    # (the whole href when it contains no '=').
    res.name = photo.find('a')['href'].rpartition('=')[2]

    # Url
    res.url = f'{TELEGRAM_WEB_URL}/{res.name}'

    # Display name
    res.display_name = tgme_page.find(class_='tgme_page_title').find('span').get_text()

    # Subscribers count: the text in front of the first ' s'.
    extra = tgme_page.find(class_='tgme_page_extra')
    if extra is not None:
        count, marker, _ = extra.get_text().partition(' s')
        # Without the marker there is nothing to cut at; slicing with the -1
        # returned by str.find would silently drop the last character instead.
        if marker:
            res.subscribers = count

    # Description
    desc = tgme_page.find(class_='tgme_page_description')
    if desc is not None:
        res.description = desc.get_text()

    res.has_preview = tgme_page.find(class_='tgme_page_context_link') is not None
    return res
208 |
def parse_right_column_channel_info(page: BeautifulSoup) -> TgChannelInfo:
    """Build a TgChannelInfo from the ``tgme_channel_info`` element of a page.

    ``has_preview`` is always set to True. Avatar, display name and url are
    read unconditionally, so a missing tag or attribute raises. Counters and
    the description are optional.

    Args:
        page (BeautifulSoup): parsed page containing a ``tgme_channel_info`` element.

    Returns:
        TgChannelInfo: the populated channel info.
    """
    res = TgChannelInfo()
    res.has_preview = True
    tgme_channel_info = page.find(class_="tgme_channel_info")
    header = tgme_channel_info.find(class_='tgme_channel_info_header')

    # Avatar
    photo = header.find(class_='tgme_page_photo_image')
    res.avatar = photo.find('img')['src']

    # Display name
    title = header.find(class_='tgme_channel_info_header_title')
    res.display_name = title.find('span').get_text()

    # Url
    res.url = header.find(class_='tgme_channel_info_header_username').find('a')['href']

    # Telegram username
    res.name = channel_name_from_url(res.url)

    # All counters (subscribers, photos, videos, links)
    counters = tgme_channel_info.find(class_='tgme_channel_info_counters')
    if counters is not None:
        for counter in counters.find_all(class_='tgme_channel_info_counter'):
            value = counter.find(class_='counter_value').get_text()
            name = counter.find(class_='counter_type').get_text()
            # NOTE(review): the attribute name is taken verbatim from page
            # text, so an unexpected label (for example a singular form)
            # creates a different attribute. Confirm against TgChannelInfo
            # fields and consider restricting the accepted names.
            setattr(res, name, value)

    # Description
    desc = tgme_channel_info.find(class_='tgme_channel_info_description')
    if desc is not None:
        res.description = desc.get_text()
    return res
236 |
def parse_post_from_node(p: BeautifulSoup) -> TgPost:
    """Parse one message wrapper node into a TgPost.

    Post url/id, the author block with its avatar, and the timestamp are read
    unconditionally, so a missing tag or attribute there raises. The same is
    true for the fields of images, rounded videos and invoices once their
    container element is present. Everything else (author url, display name,
    reply thumbnail, forwarded-from, voice metadata, video urls, link preview
    details, poll) is best-effort: what cannot be read is left at the default
    of the corresponding object.

    Args:
        p (BeautifulSoup): node whose direct child has the
            ``tgme_widget_message`` class.

    Returns:
        TgPost: the populated post.
    """
    # Errors produced by a missing tag/attribute or a malformed value in an
    # optional section. Anything else (KeyboardInterrupt included) propagates.
    recoverable = (AttributeError, IndexError, KeyError, TypeError, ValueError)

    new_post = TgPost()
    tgme_widget_message = p.find(class_="tgme_widget_message", recursive=False)
    post_url = f"{TELEGRAM_WEB_URL}/{tgme_widget_message['data-post']}"
    new_post.url = post_url
    # The id is the last path segment of the url.
    new_post.id = int(post_url.rpartition('/')[2])

    # Author
    user = p.find(class_="tgme_widget_message_user")
    user_photo = user.find(class_="tgme_widget_message_user_photo")
    # Sometimes this url does not exist in the web page.
    author_link = user.find("a")
    if author_link is not None and author_link.get("href") is not None:
        new_post.author.url = str(author_link["href"])

    new_post.author.avatar = str(user_photo.find("img")["src"])
    new_post.author.name = channel_name_from_url(new_post.author.url)

    # Author display_name
    owner_name = p.find(class_="tgme_widget_message_owner_name")
    if owner_name is not None:
        span = owner_name.find('span')
        if span is not None:
            new_post.author.display_name = span.get_text()

    # Text content: when several text elements exist, the second one is used.
    text_nodes = p.find_all(class_="tgme_widget_message_text")
    if text_nodes:
        message_text_elem = text_nodes[1] if len(text_nodes) > 1 else text_nodes[0]
        new_post.content, new_post.entities = parse_text_with_entities(message_text_elem)

    # Service message
    service_message = p.find(class_='service_message')
    if service_message is not None:
        service_msg = TgServiceMessage()

        # The type is recognised by how the message text starts; first match wins.
        prefixes = (
            ('Live stream scheduled', TG_SERVICE_MSG_LIVE_STREAM_SHEDULED),
            ('Live stream finished', TG_SERVICE_MSG_LIVE_STREAM_FINISHED),
            (f'{new_post.author.display_name} pinned', TG_SERVICE_MSG_PINNED),
            ('Channel photo updated', TG_SERVICE_MSG_CHANNEL_PHOTO_UPDATED),
            ('Channel photo removed', TG_SERVICE_MSG_CHANNEL_PHOTO_REMOVED),
            ('Channel name was changed to', TG_SERVICE_MSG_CHANNEL_RENAMED),
            ('Channel created', TG_SERVICE_MSG_CHANNEL_CREATED),
        )
        content = new_post.content
        msg_type = next(
            (kind for prefix, kind in prefixes if content.startswith(prefix)),
            TG_SERVICE_MSG_UNKNOWN,
        )
        service_msg.type = msg_type

        if msg_type == TG_SERVICE_MSG_CHANNEL_PHOTO_UPDATED:
            service_photo = p.find(class_='tgme_widget_message_service_photo')
            if service_photo is not None:
                img = service_photo.find('img')
                service_msg.extra = img['src'] if img is not None else ''
        elif msg_type in (TG_SERVICE_MSG_CHANNEL_RENAMED, TG_SERVICE_MSG_PINNED):
            strong_text = service_message.find(class_='tgme_widget_service_strong_text')
            if strong_text is not None:
                service_msg.extra = strong_text.text
        elif msg_type in (TG_SERVICE_MSG_LIVE_STREAM_FINISHED, TG_SERVICE_MSG_LIVE_STREAM_SHEDULED):
            # Extra is the first parenthesised part of the text, if any.
            in_parens = re.search(r"\((.*?)\)", content)
            if in_parens is not None:
                service_msg.extra = in_parens.group(1)

        new_post.service_msg = service_msg

    # Reply info
    reply = p.find(class_="tgme_widget_message_reply")
    if reply is not None:
        new_post.reply = TgPostReply()
        new_post.reply.url = reply['href']
        new_post.reply.author_name = reply.find(class_="tgme_widget_message_author_name").get_text()

        metatext = reply.find(class_="tgme_widget_message_metatext")
        if metatext is not None:
            new_post.reply.metatext = metatext.get_text()

        reply_thumb = reply.find(class_="tgme_widget_message_reply_thumb")
        if reply_thumb is not None:
            try:
                new_post.reply.image_url = parse_bg_image_url(reply_thumb['style'])
            except recoverable:
                pass

    # Forwarded from: name and url are filled independently, so a missing
    # span or href does not discard the other value.
    forwarded_from_name = p.find(class_="tgme_widget_message_forwarded_from_name")
    if forwarded_from_name is not None:
        new_post.forwarded_from = TgChannel()
        span = forwarded_from_name.find('span')
        if span is not None:
            new_post.forwarded_from.name = span.get_text()
        href = forwarded_from_name.get('href')
        if href is not None:
            new_post.forwarded_from.url = href

    # Rounded video
    rounded_vid = p.find(class_="tgme_widget_message_roundvideo_player")
    if rounded_vid is not None:
        new_post.rounded_video = TgPostRoundedVideo()
        thumb = rounded_vid.find(class_='tgme_widget_message_roundvideo_thumb')
        new_post.rounded_video.thumbnail = parse_bg_image_url(thumb['style'])
        vid = rounded_vid.find(class_='tgme_widget_message_roundvideo')
        new_post.rounded_video.url = vid['src']
        duration = rounded_vid.find(class_='tgme_widget_message_roundvideo_duration')
        new_post.rounded_video.duration = duration.get_text()

    # Voice
    voice_player = p.find(class_='tgme_widget_message_voice_player')
    if voice_player is not None:
        voice = voice_player.find(class_="tgme_widget_message_voice")
        if voice is not None:
            new_post.voice = TgPostVoice()
            new_post.voice.url = voice['src']
            try:
                duration = voice_player.find(class_="tgme_widget_message_voice_duration")
                new_post.voice.duration = duration.get_text()
                new_post.voice.data_waveform = voice['data-waveform']
                new_post.voice.data_ogg = voice['data-ogg']
            except recoverable:
                pass

    # Images
    for image in p.find_all(class_="tgme_widget_message_photo_wrap"):
        new_image = TgPostImage()
        new_image.url = parse_bg_image_url(image["style"])
        new_image.url_single = image["href"]
        new_post.images.append(new_image)

    # Supported videos
    for vid in p.find_all(class_="tgme_widget_message_video_player"):
        new_video = TgPostVideo()

        style = ''
        thumb = vid.find(class_="tgme_widget_message_video_thumb")
        if thumb is not None and 'style' in thumb.attrs:
            style = thumb['style']
        try:
            new_video.image_url = parse_bg_image_url(style)
        except recoverable:
            pass

        video_tag = vid.find(class_="tgme_widget_message_video")
        if video_tag is not None and video_tag.get('src') is not None:
            new_video.url = video_tag['src']

        new_video.url_single = vid['href']
        new_post.videos.append(new_video)

    # Link previews
    for prev in p.find_all(class_="tgme_widget_message_link_preview"):
        new_prev = TgPostLinkPreview()
        new_prev.url = prev["href"]

        # Each optional field is read on its own, so a broken thumbnail or a
        # missing title does not discard the rest of the preview.
        thumb = prev.find(class_="link_preview_image")
        if thumb is None:
            thumb = prev.find(class_="link_preview_right_image")
        if thumb is not None:
            try:
                new_prev.image_url = parse_bg_image_url(thumb['style'])
            except recoverable:
                pass

        title = prev.find(class_="link_preview_title")
        if title is not None:
            new_prev.title = title.get_text()

        description = prev.find(class_="link_preview_description")
        if description is not None:
            new_prev.description = description.get_text()

        site_name = prev.find(class_='link_preview_site_name')
        if site_name is not None:
            new_prev.site_name = site_name.get_text()

        new_post.link_previews.append(new_prev)

    # Documents
    for doc in p.find_all(class_='tgme_widget_message_document_wrap', recursive=True):
        new_doc = TgDocument()
        if doc.find(class_='audio') is not None:
            new_doc.type = TG_DOCUMENT_AUDIO
        else:
            new_doc.type = TG_DOCUMENT_UNKNOWN

        new_doc.url = doc['href'] if 'href' in doc.attrs else ''

        title = doc.find(class_='tgme_widget_message_document_title')
        new_doc.title = title.text if title is not None else ''

        extra = doc.find(class_='tgme_widget_message_document_extra')
        new_doc.extra = extra.text if extra is not None else ''

        new_post.documents.append(new_doc)

    # Views
    views = p.find(class_="tgme_widget_message_views")
    if views is not None:
        new_post.views = str(views.get_text())

    # Timestamp
    message_date = p.find(class_="tgme_widget_message_date")
    time_tag = message_date.find("time")
    new_post.timestamp = datetime.fromisoformat(time_tag["datetime"])

    # Sticker
    sticker_wrap = p.find(class_='tgme_widget_message_sticker_wrap')
    if sticker_wrap is not None:
        new_post.sticker = TgSticker()

        # static sticker
        static_sticker = p.find(class_='tgme_widget_message_sticker')
        if static_sticker is not None:
            if 'data-webp' in static_sticker.attrs:
                new_post.sticker.image_url = static_sticker['data-webp']
            elif 'style' in static_sticker.attrs:
                new_post.sticker.image_url = parse_bg_image_url(static_sticker['style'])

        # Animated sticker
        videosticker = p.find(class_='tgme_widget_message_videosticker')
        if videosticker is not None:
            new_post.sticker.animated = True

            sticker_video = videosticker.find(class_='js-videosticker_video')
            # All lookups below need the video element, so they stay under
            # this guard to avoid dereferencing None.
            if sticker_video is not None:
                new_post.sticker.video_url = sticker_video['src']

                webm_sticker_done = sticker_video.find(class_='webm_sticker_done')
                if webm_sticker_done is not None:
                    new_post.sticker.image_url = webm_sticker_done['src']

                if new_post.sticker.image_url == '':
                    img = sticker_video.find('img')
                    if img is not None:
                        new_post.sticker.image_url = img['src']

    # Detect unsupported media
    not_supported_wrap = p.find(class_="message_media_not_supported_wrap")
    if not_supported_wrap is not None:
        not_supported_label = not_supported_wrap.find(class_="message_media_not_supported_label")
        if not_supported_label is not None:
            # A label mentioning 'in your browser' is not counted as unsupported.
            not_support_msg = not_supported_label.get_text()
            new_post.has_not_supported = 'in your browser' not in not_support_msg
        else:
            new_post.has_not_supported = True

    # Poll
    poll_node = p.find(class_="tgme_widget_message_poll")
    if poll_node is not None:
        # Best-effort as a whole: the first unreadable part stops poll parsing
        # and keeps what has been filled in so far.
        try:
            new_post.poll = TgPoll()

            # Question
            question = poll_node.find(class_='tgme_widget_message_poll_question')
            new_post.poll.question = question.get_text()

            # Poll type
            poll_type = poll_node.find(class_="tgme_widget_message_poll_type")
            new_post.poll.type = poll_type.get_text()

            # Options
            options_node = poll_node.find(class_='tgme_widget_message_poll_options')
            for opt in options_node.find_all(class_='tgme_widget_message_poll_option'):
                new_opt = TgPoll.TgPollOption()
                new_opt.value = opt.find(class_="tgme_widget_message_poll_option_value").get_text().strip()
                percents = opt.find(class_='tgme_widget_message_poll_option_percent')
                if percents is not None:
                    # Drop the trailing character before converting to int.
                    new_opt.percents = int(percents.get_text()[:-1])
                new_post.poll.options.append(new_opt)

            # Voters count
            voters = p.find(class_="tgme_widget_message_voters")
            if voters is not None:
                new_post.poll.voters = voters.get_text()
            else:
                voters = p.find(class_="tgme_widget_message_poll_votes").get_text()
                space_pos = voters.rfind(' ')
                if space_pos != -1:
                    # NOTE(review): this also drops the character right before
                    # the last space; confirm against real markup whether
                    # ``[:space_pos]`` was intended.
                    voters = voters[:space_pos-1]
                new_post.poll.voters = voters
        except recoverable:
            pass

    # Invoice
    invoice_node = p.find(class_="tgme_widget_message_invoice")
    if invoice_node is not None:
        new_post.invoice = TgPostInvoice()
        title = invoice_node.find(class_='tgme_widget_message_invoice_title')
        new_post.invoice.title = title.get_text()
        desc = invoice_node.find(class_='tgme_widget_message_invoice_description')
        new_post.invoice.description = desc.get_text()

    return new_post
539 |
def parse_widget_post(page: BeautifulSoup) -> TgPost:
    """Parse the post found under the first ``widget_frame_base`` element.

    Args:
        page (BeautifulSoup): parsed page.

    Returns:
        TgPost: result of parse_post_from_node for that element.
    """
    return parse_post_from_node(page.find(class_="widget_frame_base"))
543 |
def parse_posts(page: BeautifulSoup) -> list[TgPost]:
    """Parse every post of a page's ``tgme_channel_history`` element.

    Only direct ``tgme_widget_message_wrap`` children of the history element
    are considered.

    Args:
        page (BeautifulSoup): parsed page.

    Returns:
        list[TgPost]: the parsed posts, in document order.
    """
    history = page.find(class_="tgme_channel_history")
    wrappers = history.find_all(class_="tgme_widget_message_wrap", recursive=False)
    return [parse_post_from_node(wrapper) for wrapper in wrappers]
554 |
def parse_posts_page(page: BeautifulSoup):
    """Parse a page into a TgPostsPage.

    ``posts`` is filled by parse_posts and ``channel`` by
    parse_right_column_channel_info, in that order.

    Args:
        page (BeautifulSoup): parsed page.

    Returns:
        TgPostsPage: object holding the posts and the channel info.
    """
    posts_page = TgPostsPage()
    posts_page.posts = parse_posts(page)
    posts_page.channel = parse_right_column_channel_info(page)
    return posts_page
560 |
--------------------------------------------------------------------------------