├── accless_tg_scraper
    ├── __init__.py
    ├── client.py
    ├── serialize
    │   ├── classes.py
    │   └── markdown.py
    ├── classes.py
    └── parser.py
├── .gitignore
├── tests
    ├── download-samples.sh
    ├── text-entities.py
    ├── client-test.py
    ├── parser-test.py
    └── tg_tests.py
├── setup.py
└── README.md


/accless_tg_scraper/__init__.py:
--------------------------------------------------------------------------------
1 | from .client import *
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | /accless_tg_scraper.egg-info
3 | /build
4 | *.html
5 | reinstall.sh
6 | install-e-mode.sh


--------------------------------------------------------------------------------
/tests/download-samples.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | rm -f tg-posts*.html tg-single-post*.html
 3 | id=0
 4 | function add () {
 5 |     local url=$1
 6 |     local url_type=$2
 7 |     id=$(($id + 1))
 8 |     if [ $url_type == 1 ];
 9 |     then
10 |         curl $url -fo tg-posts-$id.html
11 |     else
12 |         curl "$url?embed=1&mode=tme" -fo tg-single-post-$id.html
13 |     fi
14 | }
15 | 
16 | add "https://t.me/s/evgenii_ponasenkov" 1
17 | add "https://t.me/evgenii_ponasenkov/7561" 2
18 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(
 4 |     name = 'accless_tg_scraper',
 5 |     version = '0.2.0',
 6 |     author = 'Kisspeace',
 7 |     keywords = 'telegram scraper parser web',
 8 |     url = 'http://github.com/Kisspeace/accless-tg-scraper',
 9 |     description = 'Scrap telegram web WITHOUT account or API token',
10 |     packages = ['accless_tg_scraper',
11 |                 'accless_tg_scraper.serialize'],
12 |     install_requires = [
13 |         'aiohttp',
14 |         'bs4'
15 |     ]
16 | )
17 | 


--------------------------------------------------------------------------------
/tests/text-entities.py:
--------------------------------------------------------------------------------
 1 | from accless_tg_scraper.serialize.markdown import *
 2 | from accless_tg_scraper.serialize.classes import *
 3 | from accless_tg_scraper.classes import *
 4 | 
 5 | text = 'Hello. md has no spoilers support'
 6 | entities = [
 7 |     TgMessageEntityItalic(1, 2),
 8 |     TgMessageEntityBold(1, 2),
 9 |     TgMessageEntityUrl(0, 1, 'https://youtu.be/HTMDNZOlUq4'),
10 |     TgMessageEntityStrikethrough(3, 3),
11 |     TgMessageEntityBold(4, 1),
12 |     TgMessageEntitySpoiler(7, 26)
13 | ]
14 | 
15 | md = dump_content(text, entities, RULE_SET_MD)
16 | print(md)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ### <img src="https://telegram.org/img/favicon.ico" height="20">  accless-tg-scraper
 2 | Scrape posts from Telegram web WITHOUT an account or API token
 3 | #### Install
 4 | ```shell
 5 | python -m pip install "git+https://github.com/Kisspeace/accless-tg-scraper.git#egg=accless-tg-scraper" 
 6 | ```
 7 | #### Simple example
 8 | ```python
 9 | import asyncio
10 | from accless_tg_scraper import *
11 | 
12 | async def main():
13 |   telegram = TgScraper()
14 |   page = await telegram.get_posts_page('evgenii_ponasenkov')
15 |   posts = page.posts
16 | 
17 |   print(f'got {len(posts)} posts.')
18 |   for post in posts:
19 |     print(f'{post.url}:{post.content}\n')
20 |     
21 | asyncio.run(main())
22 | ```
23 | 


--------------------------------------------------------------------------------
/tests/client-test.py:
--------------------------------------------------------------------------------
 1 | #!python
 2 | import asyncio
 3 | from bs4 import BeautifulSoup
 4 | from accless_tg_scraper.client import *
 5 | from accless_tg_scraper.classes import *
 6 | import re
 7 | from tg_tests import *
 8 | from accless_tg_scraper.serialize.markdown import *
 9 | 
# Scraper instance shared by every helper in this script.
tg = TgScraper()
# Last page fetched by get_n_print(); main() reads it to request older posts.
last_posts_page = None

# Channel name used for all requests made by main().
ponasenkov_tg = 'evgenii_ponasenkov'
14 | 
async def get_n_print_channel_info(channel: str, *args, **kwargs):
    """Fetch channel info through the shared scraper and print it, then DELIM.

    Extra positional and keyword arguments are forwarded to
    ``tg.get_channel_info``.
    """
    info = await tg.get_channel_info(channel, *args, **kwargs)
    print_channel_info(info)
    print(DELIM)
19 | 
async def get_n_print(channel: str, *args, **kwargs):
    """Fetch one page of posts for ``channel``, remember it and print it.

    The fetched page is stored in the module-level ``last_posts_page`` so a
    later call can page relative to it. Extra arguments are forwarded to
    ``tg.get_posts_page``.
    """
    global last_posts_page
    # `channel` is passed positionally: combining `channel=channel` with
    # `*args` raises "got multiple values for argument 'channel'" as soon as
    # a caller supplies any extra positional argument.
    last_posts_page = await tg.get_posts_page(channel, *args, **kwargs)
    print_channel_info(last_posts_page.channel)
    print_posts(last_posts_page)
26 | 
async def main():
    """Run the live checks: channel info, two post pages and one single post."""
    await get_n_print_channel_info(ponasenkov_tg)
    await get_n_print(ponasenkov_tg)

    # get_n_print() stored the page above; ask for what comes before its first post.
    first_post_id = last_posts_page.posts[0].id
    await get_n_print(ponasenkov_tg, before=first_post_id)

    print_post(await tg.get_post(ponasenkov_tg, 7561))
    print(DELIM)


if __name__ == '__main__':
    asyncio.run(main())


--------------------------------------------------------------------------------
/tests/parser-test.py:
--------------------------------------------------------------------------------
 1 | #!python
 2 | from bs4 import BeautifulSoup
 3 | from accless_tg_scraper.parser import *
 4 | import re
 5 | from tg_tests import *
 6 | from accless_tg_scraper.serialize.markdown import *
 7 | import os
 8 | import fnmatch
 9 | 
def bs_from_file(filename: str) -> BeautifulSoup:
    """Parse the HTML file ``filename`` with the 'html.parser' backend.

    The file is opened in a ``with`` block so the handle is closed once the
    document has been built; previously it was left open.
    """
    with open(filename) as fp:
        return BeautifulSoup(fp, 'html.parser')
14 | 
def test_url_parse(url: str):
    """Print the channel name extracted from ``url`` next to the url itself."""
    channel = channel_name_from_url(url)
    print(f"{channel} from {url}")
17 | 
def test_post_id_parse(url: str):
    """Print the post id extracted from ``url`` next to the url itself."""
    post_id = post_id_from_url(url)
    print(f"{post_id} from {url}")
20 | 
# Channel-name extraction over the URL shapes the parser is expected to handle.
for sample_url in (
    'https://t.me/s/channel_name',
    'https://t.me/s/channel_name?after=1030',
    'https://t.me/channel_name/752',
    'https://t.me/channel_name?someparams=sgduh23847tgdhs',
    'https://t.me/channel_name',
    'channel_name',
):
    test_url_parse(sample_url)

# Post-id extraction, with and without a query string.
for sample_url in (
    'https://t.me/channel_name/1812',
    'https://t.me/channel_name/1488?embed=1&mode=tme',
):
    test_post_id_parse(sample_url)

# Every post parsed from the sample files is collected here.
posts = []
32 | 
def add_post(filename: str) -> TgPost:
    """Parse a single-post widget file, print the post and add it to ``posts``.

    Returns the parsed post.
    """
    parsed = parse_widget_post(bs_from_file(filename))
    print_post(parsed)
    # The module-level list is mutated in place, so no `global` is required.
    posts.append(parsed)
    return parsed
40 | 
def add_posts(filename: str) -> list[TgPost]:
    """Parse a posts-page file, print its posts and add them to ``posts``.

    Returns the newly parsed posts, as the signature declares; the function
    previously fell off the end and returned None.
    """
    global posts
    web_page = bs_from_file(filename)
    new_posts = parse_posts(web_page)
    print_posts(new_posts)
    posts = posts + new_posts
    return new_posts
47 | 
# Feed every downloaded sample in the current directory to the matching parser.
for entry in os.listdir(os.curdir):
    if fnmatch.fnmatch(entry, 'tg-single-post*'):
        add_post(entry)
        continue
    if fnmatch.fnmatch(entry, 'tg-posts*'):
        add_posts(entry)

# new = []
# for p in posts:
#     if p.has_service_msg():
#         new.append(p)
#         print(f'service msg_ {p.service_msg.type} : {p.service_msg.extra}')
# posts = new

# Append everything that was parsed to the markdown dump.
dump_posts(posts, 'dump.md', 'a')
62 | 


--------------------------------------------------------------------------------
/tests/tg_tests.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | from bs4 import BeautifulSoup
 3 | from accless_tg_scraper.parser import *
 4 | from accless_tg_scraper.client import *
 5 | from accless_tg_scraper.classes import *
 6 | import re
 7 | 
# Separator printed between items; it is empty, so print(DELIM) emits a blank line.
DELIM = ''
 9 | 
def print_channel_info(c: TgChannelInfo):
    """Print a channel's summary line, avatar and description to stdout.

    An extra line is printed when ``c.has_preview`` is truthy.
    """
    summary = (
        f'{c.display_name} - {c.name} - {c.subscribers} subs, '
        f'{c.photos} photos, {c.videos} videos, {c.links} links. {c.url}'
    )
    print(summary)
    print(f'avatar: {c.avatar}')
    print(f'desc: {c.description}')
    if not c.has_preview:
        return
    print('channel has preview page.')
16 | 
def print_post(post: TgPost):
    """Print a human-readable description of ``post`` to stdout.

    A header line is always printed; one or more extra lines follow for each
    part the post carries (forward, reply, text, sticker, voice, rounded
    video, images, videos, link previews, poll, invoice).

    Every line is built with an f-string, so a field that is not a ``str``
    is formatted instead of raising ``TypeError`` as ``+`` concatenation did.
    """
    print(f'({post.author.name}) {post.author.url} on {post.url} at {post.timestamp} with {post.views} views.')

    if post.has_forward():
        print(f'📰 forwarded from: {post.forwarded_from.name} : {post.forwarded_from.url}')

    if post.has_reply():
        print(f'✉️ reply: {post.reply.author_name} : {post.reply.url} : {post.reply.image_url}')
        print(f'✉️ reply metatext: {post.reply.metatext}')

    if post.content != "":
        print(f"Text: {post.content}")

    if post.has_sticker():
        if not post.sticker.animated:
            print(f'🗿 Sticker: {post.sticker.image_url}')
        else:
            print(f'🗿 Animated sticker: {post.sticker.video_url} {post.sticker.image_url}')

    if post.has_not_supported:
        print('⚠️ Post has not supported media !')

    if post.has_voice():
        print(f'🔊 {post.voice.duration} -> {post.voice.url}')

    if post.has_rounded_video():
        print(f'📹 {post.rounded_video.duration} -> {post.rounded_video.url}\nthumb: {post.rounded_video.thumbnail}')

    if post.has_images():
        for img in post.images:
            print(f'🌉 image: {img.url} : {img.url_single}')

    if post.has_videos():
        for vid in post.videos:
            print(f'🎥 video: {vid.url} : {vid.image_url} : {vid.url_single}')

    if post.has_link_previews():
        for link in post.link_previews:
            print(f'🔗 link ({link.site_name}): {link.url} - {link.title} - {link.description}')
            print(f'🔗 link thumbnail: {link.image_url}')

    if post.has_poll():
        print(f"❔: {post.poll.question} with {post.poll.voters} voters:")
        # Options are numbered from 1.
        for i, opt in enumerate(post.poll.options, start=1):
            print(f"{i} ) [{opt.percents}%]: {opt.value}")

    if post.has_invoice():
        print(f"💳: {post.invoice.title}: {post.invoice.description}")
67 | 
def print_posts(posts: any):
    """Print the number of posts, then every post followed by DELIM.

    Args:
        posts: either a ``TgPostsPage`` (its ``posts`` attribute is used) or
            a sized iterable of posts.
    """
    # isinstance() also accepts subclasses of TgPostsPage, which the former
    # exact `type(...) is` check rejected.
    if isinstance(posts, TgPostsPage):
        posts = posts.posts
    print('Count: ' + str(len(posts)))
    for post in posts:
        print_post(post)
        print(DELIM)


--------------------------------------------------------------------------------
/accless_tg_scraper/client.py:
--------------------------------------------------------------------------------
 1 | import asyncio, aiohttp
 2 | import copy
 3 | from bs4 import BeautifulSoup
 4 | from accless_tg_scraper.classes import *
 5 | from accless_tg_scraper.parser import *
 6 | 
 7 | class TgScraper():
 8 |     def __init__(self):
 9 |         self.base_url: str = TELEGRAM_WEB_URL
10 |         self.timeouts = aiohttp.ClientTimeout(connect=0.6)
11 |         self._headers = {
12 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
13 |             'Accept-Encoding': 'gzip, deflate, br',
14 |             'Accept-Language': 'en-US,en;q=0.5',
15 |             'TE': 'trailers',
16 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:107.0) Gecko/20100101 Firefox/107.0'
17 |         }
18 |         
19 |     def _url_preview(self, channel_name: str) -> str:
20 |         return f"{self.base_url}/s/{channel_name}"
21 |     
22 |     def _url_post_widget(self, channel: str, post_id: int) -> str:
23 |         return f"{self.base_url}/{channel}/{post_id}?embed=1&mode=tme"
24 |     
25 |     def _bs(self, response) -> BeautifulSoup:
26 |         return BeautifulSoup(response, 'html.parser')    
27 | 
28 |     def _new_session(self, *args, **kwargs) -> aiohttp.ClientSession:
29 |         return aiohttp.ClientSession(headers=self._headers, timeout=self.timeouts)
30 |     
31 |     # Setters & getters:
32 | 
33 |     def set_headers(self, headers: dict):
34 |         self._headers = copy.deepcopy(headers)
35 | 
36 |     def get_headers(self) -> dict:
37 |         return copy.deepcopy(self._headers)
38 | 
39 |     # functions
40 | 
41 |     async def get_post(self, channel: str, post_id: int) -> TgPost:
42 |         res = None
43 |         async with self._new_session() as session:
44 |             resp = await session.get(
45 |                 url=self._url_post_widget(channel, post_id))
46 |             text = await resp.text()
47 |         return parse_widget_post(self._bs(text))
48 | 
49 |     async def get_posts_page(self, channel: str, q: str = '', before = '', after = '', full_url: str = '') -> TgPostsPage:
50 |         res = None
51 |         params = {}
52 | 
53 |         if full_url == '':
54 |             url = self._url_preview(channel)  
55 | 
56 |             params = {
57 |                 'q': str(q),
58 |                 'before': str(before),
59 |                 'after': str(after)
60 |             }
61 |         else:
62 |             url = full_url
63 | 
64 |         async with self._new_session() as session:
65 |             resp = await session.get(
66 |                 url=url,
67 |                 params=params)
68 |             text = await resp.text()
69 |             
70 |         res = parse_posts_page(self._bs(text))
71 |         return res
72 |     
73 |     async def get_channel_info(self, channel: str) -> TgChannelInfo:
74 |         res = None
75 |         async with self._new_session() as session:
76 |             resp = await session.get(f'{self.base_url}/{channel}')
77 |             text = await resp.text()
78 |         res = parse_channel_info(self._bs(text))
79 |         return res


--------------------------------------------------------------------------------
/accless_tg_scraper/serialize/classes.py:
--------------------------------------------------------------------------------
 1 | from accless_tg_scraper.classes import *
 2 | from copy import deepcopy
 3 | 
 4 | class TgEntityRuleSet():
 5 |     class EntityRule():
 6 |         def __init__(self, ent_type: TgMessageEntity, prefix: str, postfix: str):
 7 |             self.prefix: str = prefix
 8 |             self.postfix: str = postfix
 9 |             self.type = ent_type
10 | 
11 |         def convert(self, entity: TgMessageEntity, source: str):
12 |             sub_str = source[entity.offset : entity.offset + entity.length]
13 |             return f'{self.prefix}{sub_str}{self.postfix}'
14 | 
15 |     def __init__(self):
16 |         self.bold: self.EntityRule = None
17 |         self.italic: self.EntityRule = None
18 |         self.strikethrogh: self.EntityRule = None
19 |         self.underlined: self.EntityRule = None
20 |         self.url: self.EntityRule = None
21 |         self.spoiler: self.EntityRule = None
22 |         self.emoji: self.EntityRule = None
23 | 
24 |     def get_rules(self) -> list:
25 |         return [
26 |             self.bold,
27 |             self.italic,
28 |             self.strikethrogh,
29 |             self.underlined,
30 |             self.url,
31 |             self.spoiler,
32 |             self.emoji
33 |         ]
34 | 
35 |     def rule_by_type(self, ent_type: TgMessageEntity):
36 |         for rule in self.get_rules():
37 |             if rule.type is ent_type:
38 |                 return rule
39 |         return None
40 | 
41 |     def get_converted(self, entity: TgMessageEntity, source: str):
42 |         rule = self.rule_by_type(type(entity))
43 |         return rule.convert(entity, source)
44 | 
def dump_content(content: str, entities: list, rule_set: 'TgEntityRuleSet') -> str:
    """Apply the rules of ``rule_set`` to ``content`` for every entity.

    Entities are processed in list order. After each replacement the
    offsets (and, for enclosing entities, the lengths) of the remaining
    entities are shifted so they keep pointing at the same text.

    Args:
        content (str): text.
        entities (list): list of entities for given text. The list is
            deep-copied first, so the caller's entities are not modified.
        rule_set (TgEntityRuleSet): rule set.
    Returns:
        str: content converted to custom format with the given entities.
        Entities whose type has no rule are left as plain text.
    """
    if not entities:
        return content

    entities = deepcopy(entities)
    res: str = content

    for i, ent in enumerate(entities):
        rule = rule_set.rule_by_type(type(ent))
        if rule is None:
            # No rule for this entity type: nothing to insert, nothing to shift.
            continue

        converted = rule.convert(ent, res)
        ent_end = ent.offset + ent.length
        res = res[:ent.offset] + converted + res[ent_end:]
        growth = len(converted) - ent.length

        # now need to remap all next entities.
        for next_ent in entities[i + 1:]:
            if next_ent.starts_after(ent):
                # Entirely behind the replaced span: shift by the full growth.
                next_ent.offset += growth
            elif next_ent.starts_inside(ent):
                # Starts within the replaced span: only the prefix lies before it.
                next_ent.offset += len(rule.prefix)
            elif next_ent.offset + next_ent.length >= ent_end:
                # Starts before and ends after the replaced span: it now has
                # to cover the inserted characters too.
                next_ent.length += growth

    return res
88 | 


--------------------------------------------------------------------------------
/accless_tg_scraper/serialize/markdown.py:
--------------------------------------------------------------------------------
  1 | from accless_tg_scraper.classes import *
  2 | from accless_tg_scraper.serialize.classes import *
  3 | from copy import deepcopy
  4 | 
  5 | class TgEntityRuleSetMarkdown(TgEntityRuleSet):
  6 | 
  7 |     class EntityRuleUrl(TgEntityRuleSet.EntityRule):
  8 |         def __init__(self, ent_type=TgMessageEntityUrl, prefix='[', postfix=']'):
  9 |             TgEntityRuleSet.EntityRule.__init__(self, ent_type=TgMessageEntityUrl, prefix='[', postfix=']')
 10 |             self.prefix = prefix
 11 |             self.postfix = postfix
 12 |             self.url_prefix: str = '('
 13 |             self.url_postfix: str = ')'
 14 | 
 15 |         def convert(self, entity: TgMessageEntity, source: str):
 16 |             sub_str = source[entity.offset : entity.offset + entity.length]
 17 |             return f'{self.prefix}{sub_str}{self.postfix}{self.url_prefix}{entity.url}{self.url_postfix}'
 18 | 
 19 |     def __init__(self):
 20 |         TgEntityRuleSet.__init__(self)
 21 |         self.bold = self.EntityRule(TgMessageEntityBold,  '**', '**')
 22 |         self.italic = self.EntityRule(TgMessageEntityItalic, '*', '*')
 23 |         self.strikethrogh = self.EntityRule(TgMessageEntityStrikethrough, '~~', '~~')
 24 |         self.underlined = self.EntityRule(TgMessageEntityUnderlined, '<u>', '</u>')
 25 |         self.url = self.EntityRuleUrl()
 26 |         self.spoiler = self.EntityRule(TgMessageEntitySpoiler, '', '')
 27 |         self.emoji = self.EntityRule(TgMessageEntityEmoji, '', '')
 28 | 
 29 | # global entity rule set for markdown.
 30 | RULE_SET_MD = TgEntityRuleSetMarkdown()
 31 | 
 32 | def dump_posts(posts: list[TgPost], file: any, fmode: str = 'a') -> None:
 33 |     if isinstance(file, str):
 34 |         file = open(file, mode=fmode)
 35 |         opened_localy = True
 36 |     else:
 37 |         opened_localy = False
 38 | 
 39 |     def out(string: str) -> None:
 40 |         file.write(string)
 41 | 
 42 |     for post in posts:
 43 |         views_str = f'with {post.views} views.' if post.views != '' else ''
 44 |         out(f'### [{post.author.display_name}]({post.author.url}): [post]({post.url}) at {post.timestamp} {views_str}  \n')
 45 | 
 46 |         if post.has_forward():
 47 |             out(f'**📰 forwarded from**: [{post.forwarded_from.name}]({post.forwarded_from.url})  \n')
 48 | 
 49 |         if post.has_reply():
 50 |             out(f'**✉️ reply**: [{post.reply.author_name}]({post.reply.url})  \n')
 51 |             out(f'**✉️ reply metatext**: {post.reply.metatext}  \n')
 52 | 
 53 |         if post.content != '':
 54 |             content = dump_content(post.content, post.entities, RULE_SET_MD)
 55 |             content = content.replace('\n', '  \n')
 56 |             out(f'{content}  \n')
 57 | 
 58 |         if post.has_sticker():
 59 |             if not post.sticker.animated:
 60 |                 out(f'[🗿 Sticker]({post.sticker.image_url})  \n')
 61 |             else:
 62 |                 out(f'[🗿 Sticker]({post.sticker.video_url}) [thumb]({post.sticker.image_url})  \n')
 63 | 
 64 |         if post.has_not_supported:
 65 |             out(f'~~⚠️ Post has not supported media !~~  \n')
 66 | 
 67 |         if post.has_voice():
 68 |             out(f'[🔊 {post.voice.duration}]({post.voice.url})  \n')    
 69 | 
 70 |         if post.has_rounded_video():
 71 |             out(f'[📹 {post.rounded_video.duration}]({post.rounded_video.url})  \n ![thumbnail]({post.rounded_video.thumbnail})  \n')
 72 | 
 73 |         if post.has_documents():
 74 |             for doc in post.documents:
 75 |                 emoji = '🎵' if doc.type == TG_DOCUMENT_AUDIO else '💾'
 76 |                 extra = f' - {doc.extra}' if doc.extra != '' else ''
 77 |                 out(f'[{emoji} file]({doc.url}): **{doc.title}**{extra}  \n')
 78 | 
 79 |         if post.has_images():
 80 |             for img in post.images:
 81 |                 out(f'![🌉 image]({img.url})  \n')
 82 | 
 83 |         if post.has_videos():
 84 |             for vid in post.videos:
 85 |                 if vid.url:
 86 |                     out(f'[🎥 video]({vid.url})  \n')
 87 | 
 88 |         if post.has_link_previews():
 89 |             for link in post.link_previews:
 90 |                 out(f'[🔗 link ({link.site_name})]({link.url}): {link.title} - {link.description}  \n')
 91 |                 out(f'[🔗 link thumbnail]({link.image_url})  \n')
 92 | 
 93 |         if post.has_poll():
 94 |             out(f'**❔ poll**: {post.poll.question} with {post.poll.voters} voters:  \n')
 95 |             i = 0
 96 |             for opt in post.poll.options:
 97 |                 i += 1
 98 |                 out(f'{i} ) [{opt.percents}%]: {opt.value}  \n')
 99 | 
100 |         if post.has_invoice():
101 |             out(f'**💳 invoice**: {post.invoice.title}: {post.invoice.description}  \n')
102 | 
103 |         out(f'\n')
104 | 
105 |     if opened_localy:
106 |         file.close()
107 | 


--------------------------------------------------------------------------------
/accless_tg_scraper/classes.py:
--------------------------------------------------------------------------------
  1 | from datetime import datetime
  2 | 
# Service message types (TgServiceMessage.type defaults to TG_SERVICE_MSG_UNKNOWN).
TG_SERVICE_MSG_UNKNOWN = -1
TG_SERVICE_MSG_CHANNEL_CREATED = 0
TG_SERVICE_MSG_CHANNEL_RENAMED = 1
TG_SERVICE_MSG_CHANNEL_PHOTO_REMOVED = 2
TG_SERVICE_MSG_CHANNEL_PHOTO_UPDATED = 3
TG_SERVICE_MSG_LIVE_STREAM_FINISHED = 4
TG_SERVICE_MSG_LIVE_STREAM_SHEDULED = 5
TG_SERVICE_MSG_PINNED = 6

# Document types (TgDocument.type defaults to TG_DOCUMENT_UNKNOWN).
TG_DOCUMENT_UNKNOWN = -1
TG_DOCUMENT_AUDIO = 0
 16 | 
 17 | class TgChannel():
 18 |     def __init__(self):
 19 |         self.url: str = ''
 20 |         self.avatar: str = ''
 21 |         self.name: str = ''
 22 |         self.display_name: str = ''
 23 | 
 24 | class TgChannelInfo(TgChannel):
 25 |     def __init__(self):
 26 |         TgChannel.__init__(self)
 27 |         self.subscribers: str = '' # like '73.2 k'
 28 |         self.photos: str = ''
 29 |         self.videos: str = ''
 30 |         self.links: str = ''
 31 |         self.description: str = ''
 32 |         self.has_preview: bool = None # can be parsed from links like 'https://t.me/channel_name'
 33 | 
 34 | class TgPostVoice():
 35 |     def __init__(self):
 36 |         self.url: str = ''
 37 |         self.data_waveform: str = ''
 38 |         self.data_ogg: str = '' # sometimes empty
 39 |         self.duration: str = '' # like '0:25'
 40 | 
 41 | class TgPostRoundedVideo():
 42 |     def __init__(self):
 43 |         self.url: str = ''
 44 |         self.thumbnail: str = ''
 45 |         self.duration: str = ''
 46 | 
 47 | class TgPostImage():
 48 |     def __init__(self):
 49 |         self.url: str = ''
 50 |         self.url_single: str = ''
 51 | 
 52 | class TgPostInvoice():
 53 |     def __init__(self):
 54 |         self.title: str = ''
 55 |         self.description: str = ''
 56 | 
 57 | class TgPostVideo(TgPostImage):
 58 |     def __init__(self):
 59 |         TgPostImage.__init__(self)
 60 |         self.image_url: str = '' # thumbnail
 61 | 
 62 | class TgPostReply():
 63 |     def __init__(self):
 64 |         self.author_name: str = ''
 65 |         self.url: str = ''
 66 |         self.image_url = ''    
 67 |         self.metatext: str = ''
 68 | 
 69 | class TgPostLinkPreview():
 70 |     def __init__(self):
 71 |         self.site_name: str = '' # like 'YouTube'
 72 |         self.url: str = ''
 73 |         self.title: str = ''
 74 |         self.description: str = ''
 75 |         self.image_url: str = ''
 76 | 
 77 | class TgSticker():
 78 |     def __init__(self):
 79 |         self.animated: bool = False
 80 |         self.image_url: str = ""
 81 |         self.video_url: str = ""
 82 | 
 83 | class TgPoll():
 84 | 
 85 |     class TgPollOption():
 86 |         def __init__(self):
 87 |             self.value: str = ''
 88 |             self.percents: int = -1
 89 | 
 90 |     def __init__(self):
 91 |         self.type: str = '' # Like 'Anonymous poll'
 92 |         self.question: str = ''
 93 |         self.options = [] # list of TgPollOption
 94 |         self.voters: str = '' # like '32.3k'
 95 | 
 96 | class TgEmoji():
 97 |     """Telegram emoji.
 98 |     id: Emoji id.
 99 |     custom: True if its a custom emoji.
100 |     animated: True if its animated emoji.
101 |     image_url: Original representation of emoji (also available for a custom).
102 |     custom_image_url: Custom representation of emoji.
103 |     data: Image svg+xml data.
104 |     tgs_url: link on .tgs file.
105 | 
106 |     """
107 |     def __init__(self):
108 |         self.id: int = -1 # Emoji id.
109 |         self.custom: bool = False
110 |         self.animated: bool = False
111 |         self.image_url: str = ''
112 |         self.custom_image_url: str = ''
113 |         self.data: str = '' # Image data as text.
114 |         self.tgs_url = '' # link on .tgs file.
115 | 
class TgMessageEntity:
    """Base class for all message entities.
    See: https://core.telegram.org/api/entities

    offset: Offset in string.
    length: Characters count.
    """

    def __init__(self, offset: int, length: int):
        # Offset in string
        self.offset = offset
        # Characters count
        self.length = length

    def same_place(self, entity) -> bool:
        """
        Returns:
            bool: True if both entities have the same offset and the same length.
        """
        if self.offset != entity.offset:
            return False
        return self.length == entity.length

    def starts_after(self, entity) -> bool:
        """
        Returns:
            bool: True if this entity starts at or past the end of the given entity.
        """
        other_end = entity.offset + entity.length
        return self.offset >= other_end

    def starts_inside(self, entity) -> bool:
        """
        Returns:
            bool: True if this entity starts within the span of the given entity.
        """
        if self.starts_after(entity):
            return False
        return self.offset >= entity.offset
143 |     
class TgMessageEntityUrl(TgMessageEntity):
    """Message entity with text and url behind the text.
    """

    def __init__(self, offset: int = 0, length: int = 0, url: str = ''):
        super().__init__(offset, length)
        # self.text: str = text
        self.url: str = url
151 | 
class TgMessageEntityEmoji(TgMessageEntity):
    """Message entity with telegram emoji.
    """

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)
        # No emoji is attached on construction.
        self.emoji: TgEmoji = None
158 | 
class TgMessageEntityBold(TgMessageEntity):
    """Message entity with bold text.
    """

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)
164 | 
class TgMessageEntityItalic(TgMessageEntity):
    """Message entity with italic text.
    """

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)
170 | 
class TgMessageEntityStrikethrough(TgMessageEntity):
    """Message entity with strikethrough text.
    """

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)
176 | 
class TgMessageEntityUnderlined(TgMessageEntity):
    """Message entity with underlined text.
    """

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)
182 | 
class TgMessageEntitySpoiler(TgMessageEntity):
    """Message entity with hidden text.
    """

    def __init__(self, offset: int = 0, length: int = 0):
        super().__init__(offset, length)
188 | 
class TgServiceMessage:
    """Service message of a post: a type constant plus type-dependent extra text."""

    def __init__(self):
        self.type: int = TG_SERVICE_MSG_UNKNOWN
        # (url, text) depends on type
        self.extra: str = ''
193 | 
class TgDocument:
    """File attached to a post: a type constant, url, title and extra text."""

    def __init__(self):
        self.type: int = TG_DOCUMENT_UNKNOWN
        self.url: str = ''
        self.title: str = ''
        self.extra: str = ''
200 | 
class TgPost():
    """A single post together with everything that may be attached to it.

    Optional single attachments (voice, sticker, poll, ...) are None when
    absent; list attachments (images, videos, ...) are empty when absent.
    The ``has_*`` methods report presence. They test identity with
    ``is not None``, so an attachment object with custom comparison
    operators cannot make the check misreport.
    """

    def __init__(self):
        self.url: str = ''
        self.id: int = -1
        # self.type: int = TG_MESSAGE
        self.content: str = ''
        self.entities: list[TgMessageEntity] = []
        # Defaults to the moment the object is created.
        self.timestamp: datetime = datetime.now()
        self.author: TgChannel = TgChannel()
        self.views: str = '' # like '1.8k'
        self.images: list[TgPostImage] = []
        self.videos: list[TgPostVideo] = []
        self.documents: list[TgDocument] = [] # list of attached files
        self.voice: TgPostVoice = None
        self.rounded_video: TgPostRoundedVideo = None
        self.link_previews: list[TgPostLinkPreview] = []
        self.has_not_supported: bool = False # Media is too big : VIEW IN TELEGRAM
        self.forwarded_from: TgChannel = None
        self.reply: TgPostReply = None
        self.sticker: TgSticker = None
        self.poll: TgPoll = None
        self.invoice: TgPostInvoice = None
        self.service_msg: TgServiceMessage = None

    def has_service_msg(self) -> bool:
        """Return True if a service message is set."""
        return self.service_msg is not None

    def has_forward(self) -> bool:
        """Return True if ``forwarded_from`` is set."""
        return self.forwarded_from is not None

    def has_reply(self) -> bool:
        """Return True if a reply is set."""
        return self.reply is not None

    def has_sticker(self) -> bool:
        """Return True if a sticker is set."""
        return self.sticker is not None

    def has_voice(self) -> bool:
        """Return True if a voice message is set."""
        return self.voice is not None

    def has_rounded_video(self) -> bool:
        """Return True if a rounded video is set."""
        return self.rounded_video is not None

    def has_images(self) -> bool:
        """Return True if the post has at least one image."""
        return len(self.images) > 0

    def has_videos(self) -> bool:
        """Return True if the post has at least one video."""
        return len(self.videos) > 0

    def has_entities(self) -> bool:
        """Return True if the post has at least one message entity."""
        return len(self.entities) > 0

    def has_link_previews(self) -> bool:
        """Return True if the post has at least one link preview."""
        return len(self.link_previews) > 0

    def has_documents(self) -> bool:
        """Return True if the post has at least one attached file."""
        return len(self.documents) > 0

    def has_poll(self) -> bool:
        """Return True if a poll is set."""
        return self.poll is not None

    def has_invoice(self) -> bool:
        """Return True if an invoice is set."""
        return self.invoice is not None
263 | 
class TgPostsPage():
    """Container for one parsed channel page: its posts plus the channel info."""

    def __init__(self):
        # Posts found on the page; empty until filled by the parser.
        self.posts: list[TgPost] = []
        self.channel = TgChannelInfo() # channel info from right column on web page
268 | 


--------------------------------------------------------------------------------
/accless_tg_scraper/parser.py:
--------------------------------------------------------------------------------
  1 | from accless_tg_scraper.classes import *
  2 | from bs4 import BeautifulSoup
  3 | from datetime import datetime
  4 | from typing import Tuple
  5 | import re
  6 | 
# Base address of the Telegram web site (no trailing slash). Used below both
# to build post/channel URLs and as the prefix stripped when parsing them.
TELEGRAM_WEB_URL = 'https://t.me'
  8 | 
  9 | def channel_name_from_url(url: str, base_url: str = TELEGRAM_WEB_URL+'/') -> str:
 10 |     base = url.find(base_url)
 11 |     if base != -1:
 12 |         res = url[base+len(base_url):]
 13 |         snslash = res[:2]
 14 |         res = res[2:] if snslash == 's/' else res            
 15 |         last = res.rfind('/')
 16 |         last = res.rfind('?') if last == -1 else last
 17 |         res = res[:last] if last != -1 else res      
 18 |         return res
 19 |     else:
 20 |         return url
 21 | 
 22 | def post_id_from_url(url: str, base_url: str = TELEGRAM_WEB_URL+'/') -> int:
 23 |     base_end = url.find(base_url)  
 24 |     if base_end != -1:
 25 |         base_end += len(base_url)
 26 | 
 27 |     next_slash = url.find('/', base_end+1)
 28 |     if next_slash != -1:
 29 |         res = url[next_slash+1:]
 30 |         params_sign = res.find('?')
 31 |         if params_sign != -1:
 32 |             res = res[:params_sign]
 33 |         return res
 34 |     return None # default
 35 | 
 36 | def parse_bg_image_url(style_str: str) -> str:
 37 |     return re.search("background-image:url\('(.*?)'\)", style_str).group(1)
 38 | 
 39 | def parse_emoji(element: BeautifulSoup) -> TgEmoji:
 40 |     res = TgEmoji()
 41 |     if 'emoji-id' in element.attrs:
 42 |         res.id = element['emoji-id'] # Emoji id.
 43 | 
 44 |     tg_emoji_wrap = element.find(class_='tg-emoji-wrap')
 45 |     if tg_emoji_wrap is not None:
 46 | 
 47 |         tg_emoji: BeautifulSoup = tg_emoji_wrap.find(class_='tg-emoji')
 48 |         if tg_emoji is not None:
 49 |             res.custom = True
 50 |             res.custom_image_url = tg_emoji['data-webp']
 51 | 
 52 |             if res.custom_image_url in ('', None):
 53 |                 # tgs file url.
 54 |                 source_tgs = tg_emoji.find(attrs={'type': 'application/x-tgsticker'})
 55 |                 res.tgs_url = source_tgs['srcset'] if source_tgs is not None else ''
 56 | 
 57 |                 # svg+xml data.
 58 |                 source_xml = tg_emoji.find(attrs={'type': 'image/svg+xml'})
 59 |                 res.data = source_xml['srcset'] if source_xml is not None else ''
 60 | 
 61 |     # Original enoji image url.
 62 |     emoji = element.find(class_='emoji')
 63 |     if emoji is not None:
 64 |         res.image_url = parse_bg_image_url(emoji['style'])
 65 | 
 66 |     return res
 67 | 
def parse_text_with_entities(element: BeautifulSoup) -> Tuple[str, list[TgMessageEntity]]:
    """Flatten an element into plain text plus a list of formatting entities.

    The element's children are walked recursively. Text nodes are appended to
    the result text, ``<br>`` becomes a newline, and every recognised tag
    produces an entity whose offset/length refer to the result text.

    Args:
        element (BeautifulSoup): element to parse the text from.

    Returns:
        Tuple: first item is the full text, second item is the list of entities.
    """
    # Enables the clean-up passes: whitespace trimming of entities and
    # removal of entities that overlap an emoji.
    FIX_ISSUES = True
    entities = []
    full_text: str = ''

    def create_entity(subject: BeautifulSoup) -> TgMessageEntity:
        # Map a tag to a new entity object; returns None for tags that are
        # not recognised (their content is still parsed by the caller).
        res = None
        if 'class' in subject.attrs:
            l_classes = subject.attrs['class']
        else:
            l_classes = []

        if (subject.name == 'tg-emoji') or ('emoji' in l_classes): # Telegram emoji.
            res = TgMessageEntityEmoji()
            res.emoji = parse_emoji(subject)
        elif (subject.name == 'b') or ('tgme_widget_service_strong_text' in l_classes): # Bold text.
            res = TgMessageEntityBold()
        elif subject.name == 'i': # Italic text.
            res = TgMessageEntityItalic()
        elif subject.name == 'a': # Hyperlink or user mention.
            res = TgMessageEntityUrl()
            res.url = subject['href']
        elif subject.name == 'u': # Underlined text.
            res = TgMessageEntityUnderlined()
        elif subject.name == 's': # Strikethrough text.
            res = TgMessageEntityStrikethrough()
        elif subject.name == 'tg-spoiler':
            res = TgMessageEntitySpoiler()
        # elif subject.name == '':
        #     pass
        return res

    def parse_entities(subject: BeautifulSoup, work_on_br: bool = True):
        # Recursively append the text of `subject` to full_text and record an
        # entity for every recognised child tag.
        DISALLOW_EMPTY_ENTITIES = True

        nonlocal full_text
        nonlocal entities

        for el in subject:
            if el.name is not None: # Is not just a text.
                if (el.name == 'br'):
                    # br tag must break line like on the web-page.
                    if work_on_br:
                        full_text += '\n'
                else:
                    # Create entity.
                    allow_entity = True
                    # The entity starts where the text collected so far ends.
                    current_offset: int = len(full_text)
                    entity: TgMessageEntity = create_entity(el)
                    if entity is not None:
                        entity.offset = current_offset

                        # Parsing the children grows full_text; the growth is
                        # the entity's length.
                        parse_entities(el, True)
                        entity.length = len(full_text) - entity.offset

                        # Fixing entities that starts or ends with whitespace.
                        if FIX_ISSUES:
                            s = full_text[entity.offset : entity.offset + entity.length]
                            diff = entity.length - len(s.lstrip())
                            rdiff = entity.length - len(s.rstrip())
                            entity.offset += diff
                            # For whitespace-only text both diffs equal the
                            # length, so the result drops below 1 and the
                            # entity is rejected below.
                            entity.length -= (diff + rdiff)

                        if DISALLOW_EMPTY_ENTITIES:
                            if entity.length < 1:
                                allow_entity = False

                        if allow_entity:
                            entities.append(entity)
                    else:
                        # Unrecognised tag: keep its text, create no entity.
                        parse_entities(el, True)
            else:
                full_text += el.text

    parse_entities(element)

    if FIX_ISSUES:
        # Removed entries are replaced by None (not deleted) so the indices
        # used by both loops stay valid; the Nones are filtered out afterwards.
        stop = len(entities)
        for i in range(0, stop):
            ent = entities[i]

            # Cleaning entities inside emojis.
            if isinstance(ent, TgMessageEntityEmoji):
                for n in range(0, stop):
                    e = entities[n]
                    if (e is ent) or (e is None):
                        continue
                    # same_place is defined on the entity class outside this
                    # file section - presumably it compares offset and length.
                    if ent.same_place(e):
                        if isinstance(e, TgMessageEntityEmoji):
                            # Two emoji entities on the same spot: the custom
                            # one is kept, the other one is dropped.
                            if ent.emoji.custom:
                                entities[n] = None
                            else:
                                entities[i] = None
                                break
                        elif isinstance(e, (TgMessageEntityBold, TgMessageEntityItalic)):
                            # Bold / italic covering exactly the emoji is dropped.
                            entities[n] = None

    # cleaning null objects.
    tmp_entities = []
    for ent in entities:
        if ent is not None:
            tmp_entities.append(ent)
    entities = tmp_entities

    return full_text, entities
180 | 
def parse_channel_info(page: BeautifulSoup) -> TgChannelInfo:
    """Read channel details from a page that has a ``tgme_page`` block.

    Fills avatar, username, URL and display name; subscribers and description
    are filled only when their elements exist. ``has_preview`` tells whether a
    ``tgme_page_context_link`` element is present.

    Args:
        page: Parsed HTML of the channel page.

    Returns:
        The populated TgChannelInfo.
    """
    info = TgChannelInfo()
    root = page.find(class_='tgme_page')

    # Avatar image.
    photo_box = root.find(class_='tgme_page_photo')
    info.avatar = photo_box.find('img')['src']

    # Telegram username: whatever follows the last '=' of the photo link.
    href = photo_box.find('a')['href']
    info.name = href[href.rfind('=') + 1:]

    # Channel URL built from the username.
    info.url = f'{TELEGRAM_WEB_URL}/{info.name}'

    # Display name.
    title_node = root.find(class_='tgme_page_title')
    info.display_name = title_node.find('span').get_text()

    # Subscribers count: the text in front of the first " s".
    extra_node = root.find(class_='tgme_page_extra')
    if extra_node is not None:
        extra_text = extra_node.get_text()
        info.subscribers = extra_text[:extra_text.find(' s')]

    # Description.
    desc_node = root.find(class_='tgme_page_description')
    if desc_node is not None:
        info.description = desc_node.get_text()

    info.has_preview = root.find(class_='tgme_page_context_link') is not None
    return info
208 | 
def parse_right_column_channel_info(page: BeautifulSoup) -> TgChannelInfo:
    """Read channel details from the ``tgme_channel_info`` block of a posts page.

    Besides avatar, display name, URL and username, every counter found in the
    block is stored on the result as an attribute named after the counter's
    type text, holding the counter's value text.

    Args:
        page: Parsed HTML of the channel posts page.

    Returns:
        The populated TgChannelInfo with ``has_preview`` set to True.
    """
    info = TgChannelInfo()
    info.has_preview = True

    panel = page.find(class_="tgme_channel_info")
    header = panel.find(class_='tgme_channel_info_header')

    # Avatar image.
    info.avatar = header.find(class_='tgme_page_photo_image').find('img')['src']

    # Display name.
    title = header.find(class_='tgme_channel_info_header_title')
    info.display_name = title.find('span').get_text()

    # Channel URL and the username derived from it.
    username_node = header.find(class_='tgme_channel_info_header_username')
    info.url = username_node.find('a')['href']
    info.name = channel_name_from_url(info.url)

    # All counters (subscribers, photos, videos, links).
    counters_box = panel.find(class_='tgme_channel_info_counters')
    for item in counters_box.find_all(class_='tgme_channel_info_counter'):
        counter_value = item.find(class_='counter_value').get_text()
        counter_type = item.find(class_='counter_type').get_text()
        setattr(info, counter_type, counter_value)

    # Description.
    desc_node = panel.find(class_='tgme_channel_info_description')
    if desc_node is not None:
        info.description = desc_node.get_text()
    return info
236 | 
def parse_post_from_node(p: BeautifulSoup) -> TgPost:
    """Parse one message wrapper node into a TgPost.

    Args:
        p: Node whose direct child with class ``tgme_widget_message`` holds
            the post (its ``data-post`` attribute gives "<channel>/<id>").

    Returns:
        A TgPost filled with every part that could be found: author, text and
        entities, service message, reply, forward, media, documents, views,
        timestamp, sticker, poll and invoice.

    Raises:
        Errors from the mandatory parts (message node, ``data-post``, author
        block and avatar, date/time element) propagate to the caller;
        optional parts are parsed on a best-effort basis.
    """
    new_post = TgPost()
    tgme_widget_message = p.find(class_="tgme_widget_message", recursive=False)
    new_post.url = f"{TELEGRAM_WEB_URL}/{tgme_widget_message['data-post']}"
    u = new_post.url
    # The numeric id is the last path segment of the post URL.
    new_post.id = int(u[u.rfind('/')+1:])

    # Author
    tgme_widget_message_user = p.find(class_="tgme_widget_message_user")
    tgme_widget_message_user_photo = tgme_widget_message_user.find(class_="tgme_widget_message_user_photo")
    try:
        # Sometimes this url does not exist in the web page.
        new_post.author.url = str(tgme_widget_message_user.find("a")["href"])
    except Exception:
        pass

    new_post.author.avatar = str(tgme_widget_message_user_photo.find("img")["src"])
    new_post.author.name = channel_name_from_url(new_post.author.url)

    # Author display_name
    tgme_widget_message_owner_name = p.find(class_="tgme_widget_message_owner_name")
    if tgme_widget_message_owner_name is not None:
        try:
            span = tgme_widget_message_owner_name.find('span')
            new_post.author.display_name = span.get_text()
        except Exception:
            pass

    # Text content
    tgme_widget_message_text = p.find_all(class_="tgme_widget_message_text")
    if len(tgme_widget_message_text) > 0:
        # When several text elements exist the second one is used.
        if len(tgme_widget_message_text) > 1:
            message_text_elem = tgme_widget_message_text[1]
        else:
            message_text_elem = tgme_widget_message_text[0]
        new_post.content, new_post.entities = parse_text_with_entities(message_text_elem)

    # Service message
    service_message = p.find(class_='service_message')
    if service_message is not None:
        service_msg = TgServiceMessage()
        # The type is recognised from the beginning of the message text.
        if new_post.content.startswith('Live stream scheduled'):
            service_msg.type = TG_SERVICE_MSG_LIVE_STREAM_SHEDULED
        elif new_post.content.startswith('Live stream finished'):
            service_msg.type = TG_SERVICE_MSG_LIVE_STREAM_FINISHED
        elif new_post.content.startswith(f'{new_post.author.display_name} pinned'):
            service_msg.type = TG_SERVICE_MSG_PINNED
        elif new_post.content.startswith('Channel photo updated'):
            service_msg.type = TG_SERVICE_MSG_CHANNEL_PHOTO_UPDATED
            service_photo = p.find(class_='tgme_widget_message_service_photo')
            if service_photo is not None:
                img = service_photo.find('img')
                service_msg.extra = img['src'] if img is not None else ''
        elif new_post.content.startswith('Channel photo removed'):
            service_msg.type = TG_SERVICE_MSG_CHANNEL_PHOTO_REMOVED
        elif new_post.content.startswith('Channel name was changed to'):
            service_msg.type = TG_SERVICE_MSG_CHANNEL_RENAMED
        elif new_post.content.startswith('Channel created'):
            service_msg.type = TG_SERVICE_MSG_CHANNEL_CREATED
        else:
            service_msg.type = TG_SERVICE_MSG_UNKNOWN

        if service_msg.type in (TG_SERVICE_MSG_CHANNEL_RENAMED, TG_SERVICE_MSG_PINNED):
            # Extra = the emphasised part of the message.
            strong_text = service_message.find(class_='tgme_widget_service_strong_text')
            if strong_text is not None:
                service_msg.extra = strong_text.text
        elif service_msg.type in (TG_SERVICE_MSG_LIVE_STREAM_FINISHED, TG_SERVICE_MSG_LIVE_STREAM_SHEDULED):
            # Extra = the text inside the first pair of parentheses, if any.
            try:
                service_msg.extra = re.search(r"\((.*?)\)", new_post.content).group(1)
            except Exception:
                pass

        new_post.service_msg = service_msg

    # Reply info
    tgme_widget_message_reply = p.find(class_="tgme_widget_message_reply")
    if tgme_widget_message_reply is not None:
        new_post.reply = TgPostReply()
        new_post.reply.url = tgme_widget_message_reply['href']
        new_post.reply.author_name = tgme_widget_message_reply.find(class_="tgme_widget_message_author_name").get_text()
        try:
            tgme_widget_message_metatext = tgme_widget_message_reply.find(class_="tgme_widget_message_metatext")
            if tgme_widget_message_metatext is not None:
                new_post.reply.metatext = tgme_widget_message_metatext.get_text()
            # The thumbnail is optional; a missing one ends up in the except.
            style = tgme_widget_message_reply.find(class_="tgme_widget_message_reply_thumb")['style']
            new_post.reply.image_url = parse_bg_image_url(style)
        except Exception:
            pass

    # Forwarded from
    try:
        tgme_widget_message_forwarded_from_name = p.find(class_="tgme_widget_message_forwarded_from_name")
        if tgme_widget_message_forwarded_from_name is not None:
            new_post.forwarded_from = TgChannel()
            new_post.forwarded_from.name = tgme_widget_message_forwarded_from_name.find('span').get_text()
            new_post.forwarded_from.url = tgme_widget_message_forwarded_from_name['href']
    except Exception:
        pass

    # Rounded video
    rounded_vid = p.find(class_="tgme_widget_message_roundvideo_player")
    if rounded_vid is not None:
        new_post.rounded_video = TgPostRoundedVideo()
        thumb = rounded_vid.find(class_='tgme_widget_message_roundvideo_thumb')
        new_post.rounded_video.thumbnail = parse_bg_image_url(thumb['style'])
        vid = rounded_vid.find(class_='tgme_widget_message_roundvideo')
        new_post.rounded_video.url = vid['src']
        duration = rounded_vid.find(class_='tgme_widget_message_roundvideo_duration')
        new_post.rounded_video.duration = duration.get_text()

    # Voice
    voice_player = p.find(class_='tgme_widget_message_voice_player')
    if voice_player is not None:
        voice = voice_player.find(class_="tgme_widget_message_voice")
        if voice is not None:
            new_post.voice = TgPostVoice()
            new_post.voice.url = voice['src']
            try:
                duration = voice_player.find(class_="tgme_widget_message_voice_duration")
                new_post.voice.duration = duration.get_text()
                new_post.voice.data_waveform = voice['data-waveform']
                new_post.voice.data_ogg = voice['data-ogg']
            except Exception:
                pass

    # Images
    images = p.find_all(class_="tgme_widget_message_photo_wrap")
    for image in images:
        new_image = TgPostImage()
        style = image["style"]
        new_image.url = parse_bg_image_url(style)
        new_image.url_single = image["href"]
        new_post.images.append(new_image)

    # Supported videos
    videos = p.find_all(class_="tgme_widget_message_video_player")
    for vid in videos:
        new_video = TgPostVideo()
        style = ''
        thumb = vid.find(class_="tgme_widget_message_video_thumb")
        if thumb is not None and 'style' in thumb.attrs:
            style = thumb['style']

        # Thumbnail and direct video URL are both optional.
        try:
            new_video.image_url = parse_bg_image_url(style)
        except Exception:
            pass

        try:
            new_video.url = vid.find(class_="tgme_widget_message_video")['src']
        except Exception:
            pass

        new_video.url_single = vid['href']
        new_post.videos.append(new_video)

    # Link previews
    link_previews = p.find_all(class_="tgme_widget_message_link_preview")
    for prev in link_previews:
        new_prev = TgPostLinkPreview()
        new_prev.url = prev["href"]

        try:
            thumb = prev.find(class_="link_preview_image")
            if thumb is None:
                thumb = prev.find(class_="link_preview_right_image")
            if thumb is not None:
                style = thumb['style']
                new_prev.image_url = parse_bg_image_url(style)

            new_prev.title = prev.find(class_="link_preview_title").get_text()
            new_prev.description = prev.find(class_="link_preview_description").get_text()
        except Exception:
            pass

        link_preview_site_name = prev.find(class_='link_preview_site_name')
        if link_preview_site_name is not None:
            new_prev.site_name = link_preview_site_name.get_text()
        new_post.link_previews.append(new_prev)

    # Documents
    docs = p.find_all(class_='tgme_widget_message_document_wrap', recursive=True)
    for doc in docs:
        new_doc = TgDocument()
        # A document containing an element with class "audio" is an audio file.
        tmp_obj = doc.find(class_='audio')
        if tmp_obj is not None:
            new_doc.type = TG_DOCUMENT_AUDIO
        else:
            new_doc.type = TG_DOCUMENT_UNKNOWN

        new_doc.url = doc['href'] if 'href' in doc.attrs else ''

        title = doc.find(class_='tgme_widget_message_document_title')
        new_doc.title = title.text if title is not None else ''

        extra = doc.find(class_='tgme_widget_message_document_extra')
        new_doc.extra = extra.text if extra is not None else ''

        new_post.documents.append(new_doc)

    # Views
    tgme_widget_message_views = p.find(class_="tgme_widget_message_views")
    if tgme_widget_message_views is not None:
        new_post.views = str(tgme_widget_message_views.get_text())

    # Timestamp
    tgme_widget_message_date = p.find(class_="tgme_widget_message_date")
    time = tgme_widget_message_date.find("time")
    new_post.timestamp = datetime.fromisoformat(time["datetime"])

    # Sticker
    tgme_widget_message_sticker_wrap = p.find(class_='tgme_widget_message_sticker_wrap')
    if tgme_widget_message_sticker_wrap is not None:
        new_post.sticker = TgSticker()

        # static sticker
        tgme_widget_message_sticker = p.find(class_='tgme_widget_message_sticker')
        if tgme_widget_message_sticker is not None:
            if 'data-webp' in tgme_widget_message_sticker.attrs:
                new_post.sticker.image_url = tgme_widget_message_sticker['data-webp']
            elif 'style' in tgme_widget_message_sticker.attrs:
                new_post.sticker.image_url = parse_bg_image_url(tgme_widget_message_sticker['style'])

        # Animated sticker
        tgme_widget_message_videosticker = p.find(class_='tgme_widget_message_videosticker')
        if tgme_widget_message_videosticker is not None:
            new_post.sticker.animated = True

            js_videosticker_video = tgme_widget_message_videosticker.find(class_='js-videosticker_video')
            # Everything below reads from the video node, so it is all kept
            # under this check; a missing node must not abort the whole post.
            if js_videosticker_video is not None:
                new_post.sticker.video_url = js_videosticker_video['src']

                webm_sticker_done = js_videosticker_video.find(class_='webm_sticker_done')
                if webm_sticker_done is not None:
                    new_post.sticker.image_url = webm_sticker_done['src']

                # Fall back to any image inside the video node.
                if new_post.sticker.image_url == '':
                    img = js_videosticker_video.find('img')
                    if img is not None:
                        new_post.sticker.image_url = img['src']

    # Detect unsupported media
    message_media_not_supported_wrap = p.find(class_="message_media_not_supported_wrap")
    if message_media_not_supported_wrap is not None:
        message_media_not_supported_label = message_media_not_supported_wrap.find(class_="message_media_not_supported_label")
        if message_media_not_supported_label is not None:
            # A label mentioning "in your browser" is not counted as
            # unsupported media.
            not_support_msg = message_media_not_supported_label.get_text()
            new_post.has_not_supported = (not_support_msg.find('in your browser') == -1)
        else:
            new_post.has_not_supported = True

    # Poll
    tgme_widget_message_poll = p.find(class_="tgme_widget_message_poll")
    if tgme_widget_message_poll is not None:
        # Best effort: the first missing piece stops poll parsing and leaves
        # the poll filled with whatever was read up to that point.
        try:
            new_post.poll = TgPoll()

            # Question
            question = tgme_widget_message_poll.find(class_='tgme_widget_message_poll_question')
            new_post.poll.question = question.get_text()

            # Poll type
            tgme_widget_message_poll_type = tgme_widget_message_poll.find(class_="tgme_widget_message_poll_type")
            new_post.poll.type = tgme_widget_message_poll_type.get_text()

            # Options
            tgme_widget_message_poll_options = tgme_widget_message_poll.find(class_='tgme_widget_message_poll_options')
            options = tgme_widget_message_poll_options.find_all(class_='tgme_widget_message_poll_option')
            for opt in options:
                new_opt = TgPoll.TgPollOption()
                new_opt.value = opt.find(class_="tgme_widget_message_poll_option_value").get_text().strip()
                percents = opt.find(class_='tgme_widget_message_poll_option_percent')
                if percents is not None:
                    percents = percents.get_text()
                    # Strip the trailing character (the percent sign).
                    percents = percents[:len(percents)-1]
                    new_opt.percents = int(percents)
                    # NOTE(review): an option without a percent element is
                    # never appended - confirm this is intended.
                    new_post.poll.options.append(new_opt)

            # Voters count
            voters = p.find(class_="tgme_widget_message_voters")
            if voters is not None:
                new_post.poll.voters = voters.get_text()
            else:
                voters = p.find(class_="tgme_widget_message_poll_votes")
                voters = voters.get_text()
                space_pos = voters.rfind(' ')
                if space_pos != -1:
                    # NOTE(review): this also drops the character in front of
                    # the last space - verify against the real page markup.
                    voters = voters[:space_pos-1]
                new_post.poll.voters = voters
        except Exception:
            pass

    # Invoice
    tgme_widget_message_invoice = p.find(class_="tgme_widget_message_invoice")
    if tgme_widget_message_invoice is not None:
        new_post.invoice = TgPostInvoice()
        # Title and description are read only when present, so an incomplete
        # invoice block does not abort parsing of the whole post.
        title = tgme_widget_message_invoice.find(class_='tgme_widget_message_invoice_title')
        if title is not None:
            new_post.invoice.title = title.get_text()
        desc = tgme_widget_message_invoice.find(class_='tgme_widget_message_invoice_description')
        if desc is not None:
            new_post.invoice.description = desc.get_text()

    return new_post
539 | 
def parse_widget_post(page: BeautifulSoup) -> TgPost:
    """Parse the single post of a widget page.

    The post is read from the page's ``widget_frame_base`` element.
    """
    return parse_post_from_node(page.find(class_="widget_frame_base"))
543 | 
def parse_posts(page: BeautifulSoup) -> []:
    """Parse every post of a channel posts page.

    Only the ``tgme_widget_message_wrap`` elements that are direct children
    of the ``tgme_channel_history`` element are parsed, in page order.

    Returns:
        A list with one parsed post per wrapper element.
    """
    wrappers = page.find(class_="tgme_channel_history").find_all(
        class_="tgme_widget_message_wrap", recursive=False)
    return [parse_post_from_node(wrapper) for wrapper in wrappers]
554 | 
def parse_posts_page(page: BeautifulSoup):
    """Parse a channel posts page into a TgPostsPage.

    The result holds the posts of the page and the channel info taken from
    the page's right column.
    """
    result = TgPostsPage()
    result.posts = parse_posts(page)
    result.channel = parse_right_column_channel_info(page)
    return result
560 | 


--------------------------------------------------------------------------------