├── readme.md ├── youtube-community-tab ├── .flake8 ├── .gitignore ├── LICENSE ├── README.md ├── pyproject.toml ├── setup.cfg ├── setup.py ├── src │ └── youtube_community_tab │ │ ├── __init__.py │ │ ├── comment.py │ │ ├── community_tab.py │ │ ├── helpers │ │ ├── __init__.py │ │ ├── clean_items.py │ │ └── utils.py │ │ ├── post.py │ │ ├── reply.py │ │ └── requests_handler.py └── tests │ ├── test_actions.py │ ├── test_community_tab.py │ ├── test_membership.py │ └── test_post.py └── ytct.py /readme.md: -------------------------------------------------------------------------------- 1 | # YouTube Community Tab 2 | 3 | This repo includes a fork of [bot-jonas/youtube-community-tab](https://github.com/bot-jonas/youtube-community-tab), as well as a script to scrape and dump community tab posts as `.json` files, along with all attached images and thumbnails. 4 | 5 | ## Setup / Update 6 | 7 | Since this version of the youtube-community-tab package is slightly modified, you will need to install/update it from this repo to guarantee compatibility. 8 | ```sh 9 | cd youtube-community-tab 10 | pip install . 11 | ``` 12 | 13 | ## Example Usage 14 | 15 | ```sh 16 | python ytct.py --cookies cookies-youtube-com.txt -d "./Ninomae Ina_nis Ch. hololive-EN" https://www.youtube.com/channel/UCMwGHR0BTZuLsmjY_NT5Pwg/community 17 | # or 18 | ./ytct.py --cookies cookies-youtube-com.txt -d "./Ninomae Ina_nis Ch. hololive-EN" https://www.youtube.com/@NinomaeInanis/community 19 | ``` 20 | 21 | ## Arguments 22 | 23 | ``` 24 | -h, --help show this help message and exit 25 | --cookies COOKIES FILE a Netscape-format cookies file; allows the script to 26 | retrieve membership-only posts 27 | -d, --directory DIRECTORY save directory (defaults to current) 28 | --post-archive FILE download only posts not listed in the archive file 29 | and record the IDs of newly downloaded posts 30 | --dates write information about the post publish date 31 | -r, --reverse set download order from oldest to newest post 32 | ```
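33 | 34 | For example, an incremental run that skips posts already recorded in an archive file and also writes publish-date info could combine these flags (the archive filename here is just illustrative): 35 | 36 | ```sh 37 | ./ytct.py --cookies cookies-youtube-com.txt --post-archive archive.txt --dates -d "./Ninomae Ina_nis Ch. hololive-EN" https://www.youtube.com/@NinomaeInanis/community 38 | ```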
39 | -------------------------------------------------------------------------------- /youtube-community-tab/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 200 3 | max-complexity = 18 -------------------------------------------------------------------------------- /youtube-community-tab/.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__/* 2 | */*.egg-info/* 3 | *.sqlite 4 | build/* 5 | dist/* 6 | tests/cookies.txt 7 | -------------------------------------------------------------------------------- /youtube-community-tab/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Jonas Alves 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /youtube-community-tab/README.md: -------------------------------------------------------------------------------- 1 | # youtube_community_tab 2 | 3 | A Python 3 interface to the YouTube community tab. It handles posts, comments, and comment replies. 4 | 5 | This is a fork of [bot-jonas/youtube-community-tab](https://github.com/bot-jonas/youtube-community-tab) that aims to return more comprehensive JSON objects and to support cookies for membership-limited posts. 6 | 7 | ## Community Tab 8 | 9 | ```python 10 | from youtube_community_tab.community_tab import CommunityTab 11 | import json 12 | 13 | 14 | def indent_print(text, level=1): 15 | indent = level * "\t" 16 | print(indent + ("\n" + indent).join(text.split("\n"))) 17 | 18 | 19 | # Cache expiration 20 | EXPIRATION_TIME = 1 * 60 * 60 21 | 22 | ct = CommunityTab("vsauce1") 23 | 24 | # Load initial posts 25 | ct.load_posts(expire_after=EXPIRATION_TIME) 26 | 27 | # Load more posts 28 | while(ct.posts_continuation_token and len(ct.posts) < 40): 29 | ct.load_posts(expire_after=EXPIRATION_TIME) 30 | 31 | post = ct.posts[0] 32 | print(f"[Post {post.post_id}]") 33 | indent_print(post.get_text()) 34 | 35 | print("\n[Thumbnails]") 36 | print(json.dumps(post.get_thumbnails()[0], indent=4)) 37 | 38 | # Load initial comments 39 | post.load_comments(expire_after=EXPIRATION_TIME) 40 | 41 | # Load more comments 42 | while(post.comments_continuation_token and len(post.comments) < 100): 43 | post.load_comments(expire_after=EXPIRATION_TIME) 44 | 45 | comment = post.comments[1] 46 | print(f"\n[Comment {comment.comment_id}]") 47 | indent_print(comment.get_text()) 48 | 49 | # Load initial comment replies 50 | comment.load_replies(expire_after=EXPIRATION_TIME) 51 | 52 | # Load more comment replies 53 | while(comment.replies_continuation_token and len(comment.replies) < 10): 54 | comment.load_replies(expire_after=EXPIRATION_TIME) 55 | 56 | reply = comment.replies[0] 57 | print(f"\n[Reply {reply.reply_id}]") 58 | indent_print(reply.get_text()) 59 | 60 | ``` 61 | 62 | Output: 63 | 64 | ``` 65 | [Post UgkxzeM19x_He9LEoerdLOHwZJsqIwamUnTj] 66 | THANK YOU! 67 | 68 | WE RAISED $20,180 for the Alzheimer's Association!!! 69 | The winner of this beautiful cube of my beard hairs will be announced November 15th!! 70 | 71 | As you all know, we also donate a portion of all proceeds from the Curiosity Box to Alzheimer's research; and there's never been a better time to do a favor for your brain and everyone else's: 72 | 73 | RIGHT NOW: subscribe with code "BEST" and I'll send you our newest box *and* throw in our BEST-OF BOX completely FREE!!!
74 | 75 | 76 | https://www.curiositybox.com 77 | 78 | [Thumbnails] 79 | [ 80 | { 81 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s288-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 82 | "width": 288, 83 | "height": 288 84 | }, 85 | { 86 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s400-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 87 | "width": 400, 88 | "height": 400 89 | }, 90 | { 91 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s462-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 92 | "width": 462, 93 | "height": 462 94 | } 95 | ] 96 | 97 | [Comment UgyTIomDXMuKf3NTo294AaABAg] 98 | Thank you for doing this. Both my grandparents are affected by alzheimer's disease. It is difficult to watch a highly creative woman and an electrical engineer fade away. 99 | 100 | [Reply UgyTIomDXMuKf3NTo294AaABAg.9TtQ3j7qvll9TtqSmVNrJu] 101 | Hey a heart 102 | ``` 103 | 104 | ## Post 105 | 106 | ```python 107 | from youtube_community_tab.post import Post 108 | import json 109 | 110 | 111 | def indent_print(text, level=1): 112 | indent = level * "\t" 113 | print(indent + ("\n" + indent).join(text.split("\n"))) 114 | 115 | 116 | # Cache expiration 117 | EXPIRATION_TIME = 1 * 60 * 60 118 | 119 | post = Post.from_post_id("UgkxzeM19x_He9LEoerdLOHwZJsqIwamUnTj") 120 | print(f"[Post {post.post_id}]") 121 | indent_print(post.get_text()) 122 | 123 | print("\n[Thumbnails]") 124 | print(json.dumps(post.get_thumbnails()[0], indent=4)) 125 | 126 | # Load initial comments 127 | post.load_comments(expire_after=EXPIRATION_TIME) 128 | 129 | # Load more comments 130 | while(post.comments_continuation_token and len(post.comments) < 100): 131 | post.load_comments(expire_after=EXPIRATION_TIME) 132 | 133 | comment = post.comments[1] 134 | print(f"\n[Comment {comment.comment_id}]") 135 | indent_print(comment.get_text()) 136 | 137 | # Load initial comment replies 138 | comment.load_replies(expire_after=EXPIRATION_TIME) 139 | 140 | # Load more comment replies 141 | while(comment.replies_continuation_token and len(comment.replies) < 10): 142 | comment.load_replies(expire_after=EXPIRATION_TIME) 143 | 144 | reply = comment.replies[0] 145 | print(f"\n[Reply {reply.reply_id}]") 146 | indent_print(reply.get_text()) 147 | 148 | ``` 149 | 150 | Output: 151 | ``` 152 | [Post UgkxzeM19x_He9LEoerdLOHwZJsqIwamUnTj] 153 | THANK YOU! 154 | 155 | WE RAISED $20,180 for the Alzheimer's Association!!! 156 | The winner of this beautiful cube of my beard hairs will be announced November 15th!! 157 | 158 | As you all know, we also donate a portion of all proceeds from the Curiosity Box to Alzheimer's research; and there's never been a better time to do a favor for your brain and everyone else's: 159 | 160 | RIGHT NOW: subscribe with code "BEST" and I'll send you our newest box *and* throw in our BEST-OF BOX completely FREE!!! 
161 | 162 | 163 | https://www.curiositybox.com 164 | 165 | [Thumbnails] 166 | [ 167 | { 168 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s288-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 169 | "width": 288, 170 | "height": 288 171 | }, 172 | { 173 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s400-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 174 | "width": 400, 175 | "height": 400 176 | }, 177 | { 178 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s462-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 179 | "width": 462, 180 | "height": 462 181 | } 182 | ] 183 | 184 | [Comment UgyTIomDXMuKf3NTo294AaABAg] 185 | Thank you for doing this. Both my grandparents are affected by alzheimer's disease. It is difficult to watch a highly creative woman and an electrical engineer fade away. 186 | 187 | [Reply UgyTIomDXMuKf3NTo294AaABAg.9TtQ3j7qvll9TtqSmVNrJu] 188 | Hey a heart 189 | ``` 190 | 191 | ## Authentication/Membership 192 | 193 | To access restricted posts, such as membership-only posts, you need to provide cookies to authenticate your requests. 194 | 195 | ```python 196 | from http import cookiejar 197 | from youtube_community_tab.requests_handler import requests_cache 198 | from youtube_community_tab.community_tab import CommunityTab 199 | 200 | # Cache expiration 201 | EXPIRATION_TIME = 1 * 60 * 60 202 | 203 | cookie_jar = cookiejar.MozillaCookieJar("cookies.txt") 204 | cookie_jar.load() 205 | requests_cache.cookies = cookie_jar 206 | 207 | ct = CommunityTab("UCMwGHR0BTZuLsmjY_NT5Pwg") 208 | ct.load_posts() 209 | 210 | membership_post = None 211 | while ct.posts_continuation_token: 212 | for post in ct.posts: 213 | if post.sponsor_only_badge is not None: 214 | membership_post = post 215 | break 216 | 217 | if(membership_post is not None): 218 | break 219 | 220 | ct.load_posts(expire_after=EXPIRATION_TIME) 221 | 222 | assert(membership_post is not None) 223 | ```
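224 | 225 | Note that requests are only signed with a `SAPISIDHASH` `Authorization` header when a `SAPISID` cookie is present in the jar (this mirrors the check done throughout the request code), so it is worth sanity-checking the cookies file right after loading it. A minimal sketch, reusing the session from the example above: 226 | 227 | ```python 228 | from requests.utils import dict_from_cookiejar 229 | 230 | # If SAPISID is missing, requests go out unauthenticated and membership-only posts stay hidden 231 | assert "SAPISID" in dict_from_cookiejar(requests_cache.cookies) 232 | ```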
"requests_cache", 19 | ], 20 | packages=find_packages(where="src"), 21 | zip_safe=False, 22 | ) 23 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ( 2 | helpers 3 | ) 4 | 5 | from .comment import Comment 6 | from .community_tab import CommunityTab 7 | from .post import Post 8 | from .reply import Reply 9 | from .requests_handler import requests_cache 10 | 11 | __all__ = [ 12 | "helpers", 13 | "Comment", 14 | "CommunityTab", 15 | "Post", 16 | "Reply", 17 | "requests_cache" 18 | ] 19 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/comment.py: -------------------------------------------------------------------------------- 1 | import json 2 | from requests.utils import dict_from_cookiejar 3 | from base64 import urlsafe_b64encode 4 | 5 | from .requests_handler import requests_cache 6 | from .helpers.utils import safely_get_value_from_key, get_auth_header, CLIENT_VERSION 7 | from .reply import Reply 8 | 9 | 10 | class Comment(object): 11 | FORMAT_URLS = { 12 | "POST": "https://www.youtube.com/post/{}", 13 | # HARD_CODED: This key seems to be constant to everyone, IDK 14 | "BROWSE_ENDPOINT": "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", 15 | "UPDATE_COMMENT_ENDPOINT": "https://www.youtube.com/youtubei/v1/comment/update_comment?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false", 16 | "PERFORM_COMMENT_ACTION_ENDPOINT": "https://www.youtube.com/youtubei/v1/comment/perform_comment_action?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false", 17 | "FIXED_COMMENT": "https://www.youtube.com/channel/{}/community?lc={}&lb={}", 18 | } 19 | 20 | def __init__( 21 | self, 22 | post_id, 23 | comment_id, 24 | channel_id=None, 25 | author=None, 26 | content_text=None, 27 | vote_count=None, 28 | replies_continuation_token=None, 29 | click_tracking_params=None, 30 | visitor_data=None, 31 | session_index="0" 32 | ): 33 | self.post_id = post_id 34 | self.comment_id = comment_id 35 | self.channel_id = channel_id 36 | self.author = author 37 | self.content_text = content_text 38 | self.vote_count = vote_count 39 | self.replies_continuation_token = replies_continuation_token 40 | self.click_tracking_params = click_tracking_params 41 | self.visitor_data = visitor_data 42 | self.session_index = session_index 43 | self.replies = [] 44 | 45 | def as_json(self): 46 | return { 47 | "comment_id": self.comment_id, 48 | "post_id": self.post_id, 49 | "channel_id": self.channel_id, 50 | "author": self.author, 51 | "content_text": self.content_text, 52 | "vote_count": self.vote_count 53 | } 54 | 55 | def __str__(self): 56 | return json.dumps(self.as_json(), indent=4) 57 | 58 | def __repr__(self): 59 | return self.__str__() 60 | 61 | def get_text(self): 62 | if self.content_text is not None: 63 | return "".join([run["text"] for run in self.content_text["runs"]]) 64 | return None 65 | 66 | def load_replies(self, expire_after=0): 67 | headers = { 68 | "Accept-Language": "en-US,en;q=0.9", 69 | "x-origin": "https://www.youtube.com", 70 | "Referer": Comment.FORMAT_URLS["POST"].format(self.post_id) 71 | } 72 | 73 | # Add authorization header 74 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 75 | if "SAPISID" in current_cookies: 76 | headers["Authorization"] = 
get_auth_header(current_cookies["SAPISID"]) 77 | 78 | if self.replies_continuation_token: 79 | headers.update( 80 | { 81 | "X-Goog-AuthUser": self.session_index, 82 | "X-Origin": "https://www.youtube.com", 83 | "X-Youtube-Client-Name": "1", 84 | "X-Youtube-Client-Version": CLIENT_VERSION 85 | } 86 | ) 87 | 88 | json_body = { 89 | "context": { 90 | "client": { 91 | "clientName": "WEB", 92 | "clientVersion": CLIENT_VERSION, 93 | "originalUrl": Comment.FORMAT_URLS["POST"].format(self.post_id), 94 | "visitorData": self.visitor_data 95 | } 96 | }, 97 | "continuation": self.replies_continuation_token, 98 | "clickTracking": {"clickTrackingParams": self.click_tracking_params} 99 | } 100 | 101 | r = requests_cache.post(Comment.FORMAT_URLS["BROWSE_ENDPOINT"], json=json_body, expire_after=expire_after, headers=headers) 102 | 103 | data = r.json() 104 | append = data["onResponseReceivedEndpoints"][0]["appendContinuationItemsAction"] 105 | self.click_tracking_params = data["trackingParams"] 106 | continuation_items = safely_get_value_from_key(append, "continuationItems", default=[]) 107 | 108 | self.append_replies_from_items(continuation_items) 109 | 110 | def append_replies_from_items(self, items): 111 | there_is_no_continuation_token = True 112 | for item in items: 113 | kind = list(item.keys())[0] 114 | 115 | if kind == "commentRenderer": 116 | self.replies.append(Reply.from_data(item[kind])) 117 | elif kind == "continuationItemRenderer": 118 | if "continuationEndpoint" in item[kind]: 119 | self.replies_continuation_token = item[kind]["continuationEndpoint"]["continuationCommand"]["token"] 120 | there_is_no_continuation_token = False 121 | elif "button" in item[kind]: 122 | self.replies_continuation_token = item[kind]["button"]["buttonRenderer"]["command"]["continuationCommand"]["token"] 123 | there_is_no_continuation_token = False 124 | 125 | if there_is_no_continuation_token: 126 | self.replies_continuation_token = False 127 | 128 | @staticmethod 129 | def from_data(data, post_id, channel_id, replies_continuation_token, click_tracking_params, visitor_data, session_index): 130 | comment = Comment( 131 | post_id, 132 | data["commentId"], 133 | channel_id=channel_id, 134 | content_text=safely_get_value_from_key(data, "contentText"), 135 | author={ 136 | "authorText": safely_get_value_from_key(data, "authorText"), 137 | "authorThumbnail": safely_get_value_from_key(data, "authorThumbnail"), 138 | "authorEndpoint": safely_get_value_from_key(data, "authorEndpoint", "browseEndpoint"), 139 | "authorIsChannelOwner": safely_get_value_from_key(data, "authorIsChannelOwner"), 140 | "sponsorCommentBadge": safely_get_value_from_key(data, "sponsorCommentBadge"), 141 | }, 142 | vote_count=safely_get_value_from_key(data, "voteCount"), 143 | replies_continuation_token=replies_continuation_token, 144 | click_tracking_params=click_tracking_params, 145 | visitor_data=visitor_data, 146 | session_index=session_index 147 | ) 148 | 149 | comment.raw_data = data 150 | 151 | return comment 152 | 153 | @staticmethod 154 | def get_fixed_comment_params(comment_id, post_id, channel_id): 155 | part1 = [ 156 | b"\x12\tcommunity\xB8\x01\x00\xCA\x01", 157 | (32 + len(post_id)).to_bytes(1, "big"), 158 | b"\x82\x01", 159 | len(comment_id).to_bytes(1, "big"), 160 | comment_id.encode(), 161 | b"\xB2\x01", 162 | len(post_id).to_bytes(1, "big"), 163 | post_id.encode(), 164 | b"\xEA\x02\x04\x10\x01\x18\x01\xAA\x03", 165 | (84 + len(post_id)).to_bytes(1, "big"), 166 | b"\x22", 167 | (64 + len(post_id)).to_bytes(1, "big"), 168 | 
b"0\x00\x82\x01", 169 | len(comment_id).to_bytes(1, "big"), 170 | comment_id.encode(), 171 | b"\xD8\x01\x01\xEA\x01", 172 | len(post_id).to_bytes(1, "big"), 173 | post_id.encode(), 174 | b"\xF2\x01", 175 | len(channel_id).to_bytes(1, "big"), 176 | channel_id.encode(), 177 | b"B\x10comments-section" 178 | ] 179 | 180 | part1 = urlsafe_b64encode(b"".join(part1)).replace(b"=", b"%3D") 181 | 182 | params = [ 183 | b"\xe2\xa9\x85\xb2\x02", 184 | (83 + 3 * len(post_id)).to_bytes(1, "big"), 185 | b"\x02\x12", 186 | len(channel_id).to_bytes(1, "big"), 187 | channel_id.encode(), 188 | b"\x1A", 189 | (54 + 3 * len(post_id)).to_bytes(1, "big"), 190 | b"\x02", 191 | part1 192 | ] 193 | 194 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 195 | 196 | return params 197 | 198 | @staticmethod 199 | def from_ids(comment_id, post_id, channel_id, expire_after=0): 200 | fixed_comment_url = Comment.FORMAT_URLS["FIXED_COMMENT"].format(channel_id, comment_id, post_id) 201 | headers = { 202 | "Accept-Language": "en-US,en;q=0.9", 203 | "x-origin": "https://www.youtube.com", 204 | "Referer": fixed_comment_url 205 | } 206 | 207 | # Add authorization header 208 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 209 | if "SAPISID" in current_cookies: 210 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 211 | 212 | c = Comment.get_fixed_comment_params(comment_id, post_id, channel_id) 213 | 214 | json_body = { 215 | "context": { 216 | "client": { 217 | "clientName": "WEB", 218 | "clientVersion": CLIENT_VERSION, 219 | "originalUrl": fixed_comment_url, 220 | } 221 | }, 222 | "continuation": c 223 | } 224 | 225 | r = requests_cache.post(Comment.FORMAT_URLS["BROWSE_ENDPOINT"], json=json_body, expire_after=expire_after, headers=headers) 226 | 227 | comment_data = safely_get_value_from_key( 228 | r.json(), "onResponseReceivedEndpoints", 1, "reloadContinuationItemsCommand", "continuationItems", 0, "commentThreadRenderer" 229 | ) 230 | 231 | if comment_data is not None: 232 | return Comment.from_data( 233 | comment_data["comment"]["commentRenderer"], 234 | post_id, 235 | channel_id, 236 | safely_get_value_from_key( 237 | comment_data, 238 | "replies", 239 | "commentRepliesRenderer", 240 | "contents", 241 | 0, 242 | "continuationItemRenderer", 243 | "continuationEndpoint", 244 | "continuationCommand", 245 | "token" 246 | ), 247 | safely_get_value_from_key( 248 | comment_data, 249 | "replies", 250 | "commentRepliesRenderer", 251 | "contents", 252 | 0, 253 | "continuationItemRenderer", 254 | "continuationEndpoint", 255 | "clickTrackingParams" 256 | ), 257 | None, 258 | None 259 | ) 260 | 261 | @staticmethod 262 | def get_update_comment_params(comment_id, post_id, channel_id): 263 | params = [ 264 | b"\n", 265 | len(comment_id).to_bytes(1, "big"), 266 | comment_id.encode(), 267 | b"*\x02\b\x00@\x01R", 268 | len(post_id).to_bytes(1, "big"), 269 | post_id.encode(), 270 | b"Z", 271 | len(channel_id).to_bytes(1, "big"), 272 | channel_id.encode() 273 | ] 274 | 275 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 276 | 277 | return params 278 | 279 | def update_comment(self, comment_text): 280 | return Comment._update_comment(comment_text, comment_id=self.comment_id, post_id=self.post_id, channel_id=self.channel_id) 281 | 282 | @staticmethod 283 | def _update_comment(comment_text, update_comment_params=None, comment_id=None, post_id=None, channel_id=None): 284 | if update_comment_params is None: 285 | update_comment_params = 
Comment.get_update_comment_params(comment_id, post_id, channel_id) 286 | 287 | headers = { 288 | "Accept-Language": "en-US,en;q=0.9", 289 | "x-origin": "https://www.youtube.com" 290 | } 291 | 292 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 293 | if "SAPISID" in current_cookies: 294 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 295 | 296 | json_body = { 297 | "context": { 298 | "client": { 299 | "clientName": "WEB", 300 | "clientVersion": CLIENT_VERSION 301 | } 302 | }, 303 | "updateCommentParams": update_comment_params, 304 | "commentText": comment_text 305 | } 306 | 307 | r = requests_cache.post( 308 | Comment.FORMAT_URLS["UPDATE_COMMENT_ENDPOINT"], 309 | json=json_body, 310 | headers=headers 311 | ) 312 | 313 | return r.json() 314 | 315 | @staticmethod 316 | def get_delete_comment_params(comment_id, post_id, channel_id): 317 | params = [ 318 | b"\b\x06\x10\x07\x1A", 319 | len(comment_id).to_bytes(1, "big"), 320 | comment_id.encode(), 321 | b"0\x00J\x15115587043600121621724P\x00\xA8\x01\x01\xB2\x01", 322 | len(post_id).to_bytes(1, "big"), 323 | post_id.encode(), 324 | b"\xBA\x01", 325 | len(channel_id).to_bytes(1, "big"), 326 | channel_id.encode(), 327 | b"\xF0\x01\x01" 328 | ] 329 | 330 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 331 | 332 | return params 333 | 334 | def delete_comment(self): 335 | return Comment._delete_comment(comment_id=self.comment_id, post_id=self.post_id, channel_id=self.channel_id) 336 | 337 | @staticmethod 338 | def _delete_comment(delete_comment_params=None, comment_id=None, post_id=None, channel_id=None): 339 | if delete_comment_params is None: 340 | delete_comment_params = Comment.get_delete_comment_params(comment_id, post_id, channel_id) 341 | 342 | return Comment.perform_action(delete_comment_params) 343 | 344 | @staticmethod 345 | def get_dislike_comment_params(value, comment_id, post_id, channel_id): 346 | params = [ 347 | b"\b\x04\x10\x07\x1A", 348 | len(comment_id).to_bytes(1, "big"), 349 | comment_id.encode(), 350 | b"0\x008", 351 | (not value).to_bytes(1, "big"), 352 | b"J\x15115587043600121621724P\x00\xA8\x01\x01\xB2\x01", 353 | len(post_id).to_bytes(1, "big"), 354 | post_id.encode(), 355 | b"\xBA\x01", 356 | len(channel_id).to_bytes(1, "big"), 357 | channel_id.encode(), 358 | b"\xF0\x01\x01" 359 | ] 360 | 361 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 362 | 363 | return params 364 | 365 | def set_dislike_comment(self, value=True): 366 | return Comment._set_dislike_comment(value, comment_id=self.comment_id, post_id=self.post_id, channel_id=self.channel_id) 367 | 368 | @staticmethod 369 | def _set_dislike_comment(value, dislike_comment_params=None, comment_id=None, post_id=None, channel_id=None): 370 | if dislike_comment_params is None: 371 | dislike_comment_params = Comment.get_dislike_comment_params(value, comment_id, post_id, channel_id) 372 | 373 | return Comment.perform_action(dislike_comment_params) 374 | 375 | @staticmethod 376 | def get_like_comment_params(value, comment_id, post_id, channel_id): 377 | params = [ 378 | b"\b\x05\x10\x07\x1A", 379 | len(comment_id).to_bytes(1, "big"), 380 | comment_id.encode(), 381 | b"0\x008", 382 | (not value).to_bytes(1, "big"), 383 | b"J\x15115587043600121621724P\x00\xA8\x01\x01\xB2\x01", 384 | len(post_id).to_bytes(1, "big"), 385 | post_id.encode(), 386 | b"\xBA\x01", 387 | len(channel_id).to_bytes(1, "big"), 388 | channel_id.encode(), 389 | b"\xF0\x01\x01" 390 | ] 391 | 392 | params = 
urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 393 | 394 | return params 395 | 396 | def set_like_comment(self, value=True): 397 | return Comment._set_like_comment(value, comment_id=self.comment_id, post_id=self.post_id, channel_id=self.channel_id) 398 | 399 | @staticmethod 400 | def _set_like_comment(value, like_comment_params=None, comment_id=None, post_id=None, channel_id=None): 401 | if like_comment_params is None: 402 | like_comment_params = Comment.get_like_comment_params(value, comment_id, post_id, channel_id) 403 | 404 | return Comment.perform_action(like_comment_params) 405 | 406 | @staticmethod 407 | def perform_action(action_params): 408 | headers = { 409 | "Accept-Language": "en-US,en;q=0.9", 410 | "x-origin": "https://www.youtube.com" 411 | } 412 | 413 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 414 | if "SAPISID" in current_cookies: 415 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 416 | 417 | json_body = { 418 | "context": { 419 | "client": { 420 | "clientName": "WEB", 421 | "clientVersion": CLIENT_VERSION, 422 | }, 423 | }, 424 | "actions": [ 425 | action_params 426 | ] 427 | } 428 | 429 | r = requests_cache.post( 430 | Comment.FORMAT_URLS["PERFORM_COMMENT_ACTION_ENDPOINT"], 431 | json=json_body, 432 | headers=headers 433 | ) 434 | 435 | return r.json() 436 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/community_tab.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from requests.utils import dict_from_cookiejar 4 | 5 | from .helpers.utils import safely_get_value_from_key, get_auth_header, CLIENT_VERSION, search_key 6 | from .requests_handler import requests_cache 7 | from .post import Post 8 | 9 | 10 | class CommunityTab(object): 11 | FORMAT_URLS = { 12 | "COMMUNITY_TAB": "https://www.youtube.com/{}/{}/community", 13 | # HARD_CODED: This key seems to be constant to everyone, IDK 14 | "BROWSE_ENDPOINT": "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" 15 | } 16 | 17 | REGEX = { 18 | "YT_INITIAL_DATA": "ytInitialData = ({(?:(?:.|\n)*)?});", 19 | "COMMUNITY_TAB_URL": "^\/.*\/community$" 20 | } 21 | 22 | def __init__(self, channel_name): 23 | self.channel_name = channel_name 24 | 25 | self.posts_continuation_token = None 26 | self.click_tracking_params = None 27 | self.visitor_data = None 28 | self.session_index = "0" 29 | self.posts = [] 30 | self.community_url = None 31 | self.channel_id = None 32 | 33 | def load_posts(self, expire_after=0): 34 | headers = { 35 | "Accept-Language": "en-US,en;q=0.9", 36 | "Referer": self.community_url 37 | } 38 | 39 | # Add authorization header 40 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 41 | if "SAPISID" in current_cookies: 42 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 43 | 44 | if self.posts_continuation_token is None: 45 | try: 46 | # Get posts from community tab enpoint 47 | self.community_url = CommunityTab.FORMAT_URLS["COMMUNITY_TAB"].format("c", self.channel_name) 48 | r = requests_cache.get(self.community_url, expire_after=expire_after, headers=headers) 49 | if r.status_code != 200: 50 | self.community_url = CommunityTab.FORMAT_URLS["COMMUNITY_TAB"].format("channel", self.channel_name) 51 | r = requests_cache.get(self.community_url, expire_after=expire_after, headers=headers) 52 | 53 | if r.status_code != 200: 54 | import sys 55 
| 56 | print(f"[Can't get data from the channel_name: {self.channel_name}]") 57 | sys.exit() 58 | 59 | m = re.findall(CommunityTab.REGEX["YT_INITIAL_DATA"], r.text) 60 | data = json.loads(m[0]) 61 | 62 | if self.channel_id is None: 63 | self.channel_id = data["metadata"]["channelMetadataRenderer"]["externalId"] 64 | 65 | except IndexError as e: 66 | print("[Can't find yt_initial_data using the regex]") 67 | raise e 68 | except json.decoder.JSONDecodeError as e: 69 | print("[Can't parse yt_initial_data from the regex]") 70 | raise e 71 | except Exception as e: 72 | print("[Some non-expected exception, probably caused by requests...]") 73 | raise e 74 | 75 | tabs = data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"] 76 | community_tab = CommunityTab.get_community_tab(tabs) 77 | community_tab_items = CommunityTab.get_items_from_community_tab(community_tab) 78 | 79 | self.click_tracking_params = CommunityTab.get_click_tracking_params_from_community_tab(community_tab) 80 | self.visitor_data = data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"]["visitorData"] 81 | self.session_index = str( 82 | safely_get_value_from_key(data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"], "sessionIndex", default="") 83 | ) 84 | self.append_posts_from_items(community_tab_items) 85 | elif self.posts_continuation_token is not False: 86 | headers.update( 87 | { 88 | "X-Goog-AuthUser": self.session_index, 89 | "X-Origin": "https://www.youtube.com", 90 | "X-Youtube-Client-Name": "1", 91 | "X-Youtube-Client-Version": CLIENT_VERSION 92 | } 93 | ) 94 | 95 | json_body = { 96 | "context": { 97 | "client": {"clientName": "WEB", "clientVersion": CLIENT_VERSION, "originalUrl": self.community_url, "visitorData": self.visitor_data} 98 | }, 99 | "continuation": self.posts_continuation_token, 100 | "clickTracking": {"clickTrackingParams": self.click_tracking_params} 101 | } 102 | 103 | r = requests_cache.post(CommunityTab.FORMAT_URLS["BROWSE_ENDPOINT"], json=json_body, expire_after=expire_after, headers=headers) 104 | 105 | data = r.json() 106 | append = data["onResponseReceivedEndpoints"][0]["appendContinuationItemsAction"] 107 | self.click_tracking_params = data["onResponseReceivedEndpoints"][0]["clickTrackingParams"] 108 | self.append_posts_from_items(safely_get_value_from_key(append, "continuationItems", default=[])) 109 | 110 | def append_posts_from_items(self, items): 111 | there_is_no_continuation_token = True 112 | for item in items: 113 | kind = list(item.keys())[0] 114 | 115 | if kind == "backstagePostThreadRenderer": 116 | post_data = item["backstagePostThreadRenderer"]["post"] 117 | self.posts.append(Post.from_data(post_data)) 118 | elif kind == "continuationItemRenderer": 119 | self.posts_continuation_token = item[kind]["continuationEndpoint"]["continuationCommand"]["token"] 120 | there_is_no_continuation_token = False 121 | 122 | if there_is_no_continuation_token: 123 | self.posts_continuation_token = False 124 | 125 | @staticmethod 126 | def get_community_tab(tabs): 127 | for tab in tabs: 128 | if "tabRenderer" in tab and re.match(CommunityTab.REGEX["COMMUNITY_TAB_URL"], tab["tabRenderer"]["endpoint"]["commandMetadata"]["webCommandMetadata"]["url"]): 129 | return tab 130 | raise Exception(f"[Could not find a Community tab in the channel response]") 131 | 132 | @staticmethod 133 | def get_items_from_community_tab(tab): 134 | try: 135 | return tab["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"] 136 | except 
Exception as e: 137 | print("[Can't get the contents from the tab]") 138 | raise e 139 | 140 | @staticmethod 141 | def get_click_tracking_params_from_community_tab(tab): 142 | try: 143 | return tab["tabRenderer"]["content"]["sectionListRenderer"]["trackingParams"] 144 | except Exception as e: 145 | print("[Can't get tracking params from the tab]") 146 | raise e 147 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | safely_get_value_from_key, 3 | save_object_to_file, 4 | safely_pop_value_from_key, 5 | search_key, 6 | get_auth_header, 7 | CLIENT_VERSION 8 | ) 9 | from .clean_items import ( 10 | clean_content_text, 11 | clean_backstage_attachment 12 | ) 13 | 14 | __all__ = [ 15 | "safely_get_value_from_key", 16 | "safely_pop_value_from_key", 17 | "save_object_to_file", 18 | "search_key", 19 | "get_auth_header", 20 | "clean_content_text", 21 | "clean_backstage_attachment", 22 | "CLIENT_VERSION" 23 | ] 24 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/helpers/clean_items.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import parse_qs, unquote, urlparse 2 | from .utils import safely_get_value_from_key as safe 3 | from .utils import safely_pop_value_from_key as safe_pop 4 | 5 | 6 | # Lots of returned objects are full of tracking params, client data, duplicate info, etc.; this trims the fat. 7 | def clean_content_text(content): 8 | for item in safe(content, "runs", default=[]): 9 | if "navigationEndpoint" in item: 10 | # traditional links 11 | if "urlEndpoint" in item["navigationEndpoint"]: 12 | url = item["navigationEndpoint"]["urlEndpoint"]["url"] 13 | # replace redirects with direct links 14 | if url.startswith("https://www.youtube.com/redirect"): 15 | parsed_url = urlparse(url) 16 | redirect_url = parse_qs(parsed_url.query)["q"][0] 17 | url = unquote(redirect_url) 18 | item["urlEndpoint"] = {"url": url} 19 | item.pop("navigationEndpoint") 20 | # hashtags 21 | elif "browseEndpoint" in item["navigationEndpoint"]: 22 | item.pop("loggingDirectives", None) 23 | safe_pop(item, "navigationEndpoint", "browseEndpoint", "params") 24 | item["browseEndpoint"] = item["navigationEndpoint"]["browseEndpoint"] 25 | item["browseEndpoint"]["url"] = item["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"] 26 | item.pop("navigationEndpoint") 27 | return content 28 | 29 | def clean_backstage_attachment(attachment): 30 | if attachment: 31 | if "pollRenderer" in attachment: 32 | for choice in attachment["pollRenderer"]["choices"]: 33 | for value in [ 34 | "selectServiceEndpoint", 35 | "deselectServiceEndpoint", 36 | "voteRatioIfSelected", 37 | "votePercentageIfSelected", 38 | "voteRatioIfNotSelected", 39 | "votePercentageIfNotSelected" 40 | ]: 41 | safe_pop(choice, value) 42 | elif "videoRenderer" in attachment: 43 | safe_pop(attachment, "videoRenderer", "navigationEndpoint", "watchEndpoint", "watchEndpointSupportedOnesieConfig") 44 | attachment["videoRenderer"]["watchEndpoint"] = safe(attachment, "videoRenderer", "navigationEndpoint", "watchEndpoint", default={}) 45 | attachment["videoRenderer"]["watchEndpoint"]["url"] = safe( 46 | attachment, "videoRenderer", "navigationEndpoint", "commandMetadata", "webCommandMetadata", "url" 47 | ) 48 |
49 | for long_by_line in safe(attachment, "videoRenderer", "longBylineText", "runs", default=[]): 50 | long_by_line["browseEndpoint"] = long_by_line["navigationEndpoint"]["browseEndpoint"] 51 | long_by_line.pop("navigationEndpoint") 52 | 53 | for short_by_line in safe(attachment, "videoRenderer", "shortBylineText", "runs", default=[]): 54 | short_by_line["browseEndpoint"] = short_by_line["navigationEndpoint"]["browseEndpoint"] 55 | short_by_line.pop("navigationEndpoint") 56 | 57 | for author in safe(attachment, "videoRenderer", "ownerText", "runs", default=[]): 58 | author["browseEndpoint"] = author["navigationEndpoint"]["browseEndpoint"] 59 | 60 | for value in [ 61 | "publishedTimeText", 62 | "navigationEndpoint", 63 | "trackingParams", 64 | "showActionMenu", 65 | "menu", 66 | "channelThumbnailSupportedRenderers", 67 | "thumbnailOverlays" 68 | ]: 69 | safe_pop(attachment, "videoRenderer", value) 70 | elif "backstageImageRenderer" in attachment: 71 | safe_pop(attachment, "backstageImageRenderer", "trackingParams") 72 | elif "postMultiImageRenderer" in attachment: 73 | for image in attachment["postMultiImageRenderer"]["images"]: 74 | safe_pop(image, "backstageImageRenderer", "trackingParams") 75 | return attachment 76 | return None 77 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/helpers/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from hashlib import sha1 4 | 5 | CLIENT_VERSION = "2.20220311.01.00" 6 | 7 | def safely_get_value_from_key(*args, default=None): 8 | obj = args[0] 9 | keys = args[1:] 10 | 11 | for key in keys: 12 | try: 13 | obj = obj[key] 14 | except Exception: 15 | return default 16 | 17 | return obj 18 | 19 | 20 | def safely_pop_value_from_key(*args): 21 | obj = args[0] 22 | keys = args[1:-1] 23 | 24 | for key in keys: 25 | try: 26 | obj = obj[key] 27 | except Exception: 28 | return None 29 | 30 | pop_key = args[-1] 31 | 32 | if pop_key in obj: 33 | obj.pop(pop_key) 34 | 35 | 36 | def search_key(key, data, current_key=[]): 37 | found = [] 38 | 39 | if type(data).__name__ == "dict": 40 | keys = list(data.keys()) 41 | elif type(data).__name__ == "list": 42 | keys = list(range(len(data))) 43 | else: 44 | return [] 45 | 46 | if key in keys: 47 | found.append((current_key + [key], data[key])) 48 | keys.remove(key) 49 | 50 | for k in keys: 51 | found += search_key(key, data[k], current_key=current_key + [k]) 52 | 53 | return found 54 | 55 | 56 | def save_object_to_file(obj, path): 57 | with open(path, "w") as f: 58 | f.write(json.dumps(obj, indent=4)) 59 | 60 | 61 | # Builds YouTube's SAPISIDHASH Authorization header (an observed, not officially documented scheme): 62 | # sha1("<unix timestamp> <SAPISID cookie> <origin>"), prefixed with the same timestamp 63 | def get_auth_header(sapisid): 64 | timestring = str(int(time.time())) 65 | return f"SAPISIDHASH {timestring}_" + sha1(" ".join([timestring, sapisid, "https://www.youtube.com"]).encode()).hexdigest() 66 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/post.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from requests.utils import dict_from_cookiejar 4 | from base64 import urlsafe_b64encode 5 | 6 | from .helpers.clean_items import clean_content_text, clean_backstage_attachment 7 | from .helpers.utils import safely_get_value_from_key, search_key, get_auth_header, CLIENT_VERSION 8 | from .requests_handler import requests_cache 9 | from .comment import Comment 10 | 11 | 12 | class Post(object): 13 | FORMAT_URLS = { 14
| "POST": "https://www.youtube.com/post/{}", 15 | # HARD_CODED: This key seems to be constant to everyone, IDK 16 | "BROWSE_ENDPOINT": "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", 17 | "CREATE_COMMENT_ENDPOINT": "https://www.youtube.com/youtubei/v1/comment/create_comment?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false" 18 | } 19 | 20 | REGEX = { 21 | "YT_INITIAL_DATA": "ytInitialData = ({(?:(?:.|\n)*)?});" 22 | } 23 | 24 | def __init__(self, post_id, channel_id, author=None, content_text=None, backstage_attachment=None, vote_count=None, sponsor_only_badge=None, published_time_text=None, original_post=None): 25 | self.post_id = post_id 26 | self.channel_id = channel_id 27 | self.author = author 28 | self.content_text = content_text 29 | self.backstage_attachment = backstage_attachment 30 | self.vote_count = vote_count 31 | self.sponsor_only_badge = sponsor_only_badge 32 | self.published_time_text = published_time_text 33 | self.original_post = original_post 34 | 35 | self.first = True 36 | self.comments = [] 37 | self.comments_continuation_token = None 38 | self.click_tracking_params = None 39 | self.visitor_data = None 40 | self.session_index = "0" 41 | 42 | def as_json(self): 43 | return { 44 | "post_id": self.post_id, 45 | "channel_id": self.channel_id, 46 | "author": self.author, 47 | "content_text": self.content_text, 48 | "backstage_attachment": self.backstage_attachment, 49 | "vote_count": self.vote_count, 50 | "sponsor_only_badge": self.sponsor_only_badge, 51 | "original_post": self.original_post and self.original_post.as_json() 52 | } 53 | 54 | def get_published_string(self): 55 | return self.published_time_text 56 | 57 | @staticmethod 58 | def from_post_id(post_id, expire_after=0): 59 | headers = { 60 | "Accept-Language": "en-US,en;q=0.9", 61 | "Referer": Post.FORMAT_URLS["POST"].format(post_id) 62 | } 63 | 64 | # Add authorization header 65 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 66 | if "SAPISID" in current_cookies: 67 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 68 | 69 | post_url = Post.FORMAT_URLS["POST"].format(post_id) 70 | r = requests_cache.get(post_url, expire_after=expire_after, headers=headers) 71 | 72 | m = re.findall(Post.REGEX["YT_INITIAL_DATA"], r.text) 73 | data = json.loads(m[0]) 74 | community_tab = data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][0] 75 | community_tab_items = Post.get_items_from_community_tab(community_tab) 76 | 77 | post_data = community_tab_items[0]["backstagePostThreadRenderer"]["post"] 78 | 79 | post = Post.from_data(post_data) 80 | post.get_first_continuation_token(data) 81 | post.get_click_tracking_params(data) 82 | post.visitor_data = data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"]["visitorData"] 83 | post.session_index = str( 84 | safely_get_value_from_key(data, "responseContext", "webResponseContextExtensionData", "ytConfigData", "sessionIndex", default="") 85 | ) 86 | 87 | return post 88 | 89 | def __str__(self): 90 | return json.dumps(self.as_json(), indent=4) 91 | 92 | def __repr__(self): 93 | return self.__str__() 94 | 95 | def get_thumbnails(self): 96 | # Returns a list of the thumbnails in different resolutions of 97 | # all images present in the post 98 | thumbnails = [] 99 | 100 | if self.backstage_attachment is not None: 101 | renderer_key = list(self.backstage_attachment.keys())[0] 102 | 103 | if renderer_key == "videoRenderer": 104 | thumbnails = 
[self.backstage_attachment[renderer_key]["thumbnail"]["thumbnails"]] 105 | elif renderer_key == "backstageImageRenderer": 106 | thumbnails = [self.backstage_attachment[renderer_key]["image"]["thumbnails"]] 107 | elif renderer_key == "postMultiImageRenderer": 108 | thumbnails = [img["backstageImageRenderer"]["image"]["thumbnails"] for img in self.backstage_attachment[renderer_key]["images"]] 109 | elif renderer_key == "pollRenderer": 110 | print("[There is nothing implemented for polls]") 111 | thumbnails = [] 112 | else: 113 | raise Exception(f"There is no implementation for renderer_key={renderer_key} yet") 114 | 115 | return thumbnails 116 | 117 | # Digs the initial comments continuation token out of the post page's ytInitialData 118 | def get_first_continuation_token(self, data): 119 | self.comments_continuation_token = data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][0]["tabRenderer"]["content"]["sectionListRenderer"][ 120 | "contents"][1]["itemSectionRenderer"]["contents"][0]["continuationItemRenderer"]["continuationEndpoint"]["continuationCommand"]["token"] 121 | 122 | # Extracts the click tracking params that accompany that continuation token 123 | def get_click_tracking_params(self, data): 124 | self.click_tracking_params = data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][0]["tabRenderer"]["content"]["sectionListRenderer"][ 125 | "contents"][1]["itemSectionRenderer"]["contents"][0]["continuationItemRenderer"]["continuationEndpoint"]["clickTrackingParams"] 126 | 127 | # Loads one page of comments. The first call scrapes the post page for the 128 | # initial continuation token; subsequent calls hit the browse endpoint with 129 | # the stored continuation token and click tracking params. 130 | def load_comments(self, expire_after=0): 131 | headers = { 132 | "Accept-Language": "en-US,en;q=0.9", 133 | "Referer": Post.FORMAT_URLS["POST"].format(self.post_id) 134 | } 135 | 136 | # Add authorization header 137 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 138 | if "SAPISID" in current_cookies: 139 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 140 | 141 | if self.comments_continuation_token is None: 142 | try: 143 | r = requests_cache.get(Post.FORMAT_URLS["POST"].format(self.post_id), expire_after=expire_after, headers=headers) 144 | 145 | m = re.findall(Post.REGEX["YT_INITIAL_DATA"], r.text) 146 | data = json.loads(m[0]) 147 | 148 | self.get_first_continuation_token(data) 149 | self.get_click_tracking_params(data) 150 | self.visitor_data = data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"]["visitorData"] 151 | self.session_index = str(data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"]["sessionIndex"]) 152 | self.load_comments(expire_after=expire_after) 153 | except Exception as e: 154 | print("[Some non-expected exception, probably caused by requests...]") 155 | raise e 156 | elif self.comments_continuation_token is not False: 157 | headers.update( 158 | { 159 | "X-Goog-AuthUser": self.session_index, 160 | "X-Origin": "https://www.youtube.com", 161 | "X-Youtube-Client-Name": "1", 162 | "X-Youtube-Client-Version": CLIENT_VERSION 163 | } 164 | ) 165 | 166 | json_body = { 167 | "context": { 168 | "client": { 169 | "clientName": "WEB", 170 | "clientVersion": CLIENT_VERSION, 171 | "originalUrl": Post.FORMAT_URLS["POST"].format(self.post_id), 172 | "visitorData": self.visitor_data 173 | } 174 | }, 175 | "continuation": self.comments_continuation_token, 176 | "clickTracking": {"clickTrackingParams": self.click_tracking_params} 177 | } 178 |
179 | r = requests_cache.post(Post.FORMAT_URLS["BROWSE_ENDPOINT"], json=json_body, expire_after=expire_after, headers=headers) 180 | 181 | data = r.json() 182 | if self.first: 183 | if "continuationItems" not in data["onResponseReceivedEndpoints"][1]["reloadContinuationItemsCommand"]: 184 | # There are no comments 185 | continuation_items = [] 186 | else: 187 | append = data["onResponseReceivedEndpoints"][1]["reloadContinuationItemsCommand"] 188 | continuation_items = safely_get_value_from_key(append, "continuationItems", default=[]) 189 | self.first = False 190 | else: 191 | append = data["onResponseReceivedEndpoints"][0]["appendContinuationItemsAction"] 192 | continuation_items = safely_get_value_from_key(append, "continuationItems", default=[]) 193 | 194 | self.click_tracking_params = data["trackingParams"] 195 | self.append_comments_from_items(continuation_items) 196 | 197 | def append_comments_from_items(self, items): 198 | there_is_no_continuation_token = True 199 | for item in items: 200 | kind = list(item.keys())[0] 201 | 202 | if kind == "commentThreadRenderer": 203 | self.comments.append( 204 | Comment.from_data( 205 | item[kind]["comment"]["commentRenderer"], 206 | self.post_id, 207 | self.channel_id, 208 | safely_get_value_from_key( 209 | item[kind], 210 | "replies", 211 | "commentRepliesRenderer", 212 | "contents", 213 | 0, 214 | "continuationItemRenderer", 215 | "continuationEndpoint", 216 | "continuationCommand", 217 | "token" 218 | ), 219 | safely_get_value_from_key( 220 | item[kind], 221 | "replies", 222 | "commentRepliesRenderer", 223 | "contents", 224 | 0, 225 | "continuationItemRenderer", 226 | "continuationEndpoint", 227 | "clickTrackingParams" 228 | ), 229 | self.visitor_data, 230 | self.session_index 231 | ) 232 | ) 233 | elif kind == "continuationItemRenderer": 234 | self.comments_continuation_token = item[kind]["continuationEndpoint"]["continuationCommand"]["token"] 235 | there_is_no_continuation_token = False 236 | 237 | if there_is_no_continuation_token: 238 | self.comments_continuation_token = False 239 | 240 | def get_text(self): 241 | runs = safely_get_value_from_key(self.content_text, "runs", default=[]) 242 | 243 | if self.content_text is not None: 244 | return "\n".join([run["text"] for run in runs]) 245 | return None 246 | 247 | def get_create_comment_params(self): 248 | if self.channel_id is None or self.post_id is None: 249 | return None 250 | 251 | params = [ 252 | b"*\x02\b\x00P\x01\xA2\x01", 253 | len(self.post_id).to_bytes(1, "big"), 254 | self.post_id.encode(), 255 | b"\xAA\x01", 256 | len(self.channel_id).to_bytes(1, "big"), 257 | self.channel_id.encode(), 258 | ] 259 | 260 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 261 | 262 | return params 263 | 264 | def create_comment(self, comment_text): 265 | headers = { 266 | "Accept-Language": "en-US,en;q=0.9", 267 | "x-origin": "https://www.youtube.com" 268 | } 269 | 270 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 271 | if "SAPISID" in current_cookies: 272 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 273 | 274 | json_body = { 275 | "context": { 276 | "client": { 277 | "clientName": "WEB", 278 | "clientVersion": CLIENT_VERSION, 279 | }, 280 | }, 281 | "createCommentParams": self.get_create_comment_params(), 282 | "commentText": comment_text 283 | } 284 | 285 | r = requests_cache.post( 286 | Post.FORMAT_URLS["CREATE_COMMENT_ENDPOINT"], 287 | json=json_body, 288 | headers=headers 289 | ) 290 | 291 | try: 292 | data = r.json() 293 
| comment_id = search_key("comment", data)[0][1]["commentRenderer"]["commentId"] 294 | 295 | return Comment.from_ids(comment_id, self.post_id, self.channel_id) 296 | except Exception as e: 297 | raise e 298 | 299 | @staticmethod 300 | def from_data(post_data): 301 | if "sharedPostRenderer" in post_data: 302 | data = post_data["sharedPostRenderer"] 303 | data["contentText"] = data.pop("content") 304 | data["authorText"] = data.pop("displayName") 305 | data["authorEndpoint"] = data.pop("endpoint") 306 | 307 | original_post_data = post_data["sharedPostRenderer"]["originalPost"] 308 | data["originalPost"] = Post.from_data(original_post_data) 309 | 310 | elif "backstagePostRenderer" in post_data: 311 | data = post_data["backstagePostRenderer"] 312 | else: 313 | raise NotImplementedError(f"[post_kind={list(post_data.keys())[0]} is not implemented yet!]") 314 | 315 | data["channelId"] = data["authorEndpoint"]["browseEndpoint"]["browseId"] 316 | 317 | # clean the author cause it's different here for some reason 318 | for item in data["authorText"]["runs"]: 319 | item["browseEndpoint"] = item["navigationEndpoint"]["browseEndpoint"] 320 | item["browseEndpoint"]["url"] = item["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"] 321 | item.pop("navigationEndpoint") 322 | data["authorEndpoint"]["browseId"] = data["authorEndpoint"]["browseEndpoint"]["browseId"] 323 | author_url = data["authorEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"] 324 | data["authorEndpoint"]["url"] = author_url 325 | for value in ["clickTrackingParams", "commandMetadata", "browseEndpoint"]: 326 | data["authorEndpoint"].pop(value) 327 | 328 | post = Post( 329 | data["postId"], 330 | channel_id=data["channelId"], 331 | author={ 332 | "authorText": safely_get_value_from_key(data, "authorText"), 333 | "authorThumbnail": safely_get_value_from_key(data, "authorThumbnail"), 334 | "authorEndpoint": safely_get_value_from_key(data, "authorEndpoint") 335 | }, 336 | content_text=clean_content_text(safely_get_value_from_key(data, "contentText")), 337 | backstage_attachment=clean_backstage_attachment(safely_get_value_from_key(data, "backstageAttachment", default=None)), 338 | vote_count=safely_get_value_from_key(data, "voteCount"), 339 | sponsor_only_badge=safely_get_value_from_key(data, "sponsorsOnlyBadge", default=None), 340 | published_time_text=safely_get_value_from_key(data, "publishedTimeText", "runs", 0, "text", default=None), 341 | original_post=safely_get_value_from_key(data, "originalPost", default=None) 342 | ) 343 | 344 | post.raw_data = data 345 | 346 | return post 347 | 348 | @staticmethod 349 | def get_items_from_community_tab(tab): 350 | try: 351 | return tab["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"] 352 | except Exception as e: 353 | print("[Can't get the contents from the tab]") 354 | raise e 355 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/reply.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from .helpers.utils import safely_get_value_from_key 4 | 5 | 6 | class Reply(object): 7 | def __init__(self, reply_id, author=None, content_text=None, vote_count=None): 8 | self.reply_id = reply_id 9 | self.author = author 10 | self.content_text = content_text 11 | self.vote_count = vote_count 12 | 13 | def as_json(self): 14 | return {"reply_id": self.reply_id, "author": self.author, "content_text": 
self.content_text, "vote_count": self.vote_count} 15 | 16 | def __str__(self): 17 | return json.dumps(self.as_json(), indent=4) 18 | 19 | def __repr__(self): 20 | return self.__str__() 21 | 22 | def get_text(self): 23 | if self.content_text is not None: 24 | return "".join([run["text"] for run in self.content_text["runs"]]) 25 | return None 26 | 27 | @staticmethod 28 | def from_data(data): 29 | reply = Reply( 30 | data["commentId"], 31 | content_text=safely_get_value_from_key(data, "contentText"), 32 | author={ 33 | "authorText": safely_get_value_from_key(data, "authorText"), 34 | "authorThumbnail": safely_get_value_from_key(data, "authorThumbnail"), 35 | "authorEndpoint": safely_get_value_from_key(data, "authorEndpoint", "browseEndpoint"), 36 | "authorIsChannelOwner": safely_get_value_from_key(data, "authorIsChannelOwner"), 37 | "sponsorCommentBadge": safely_get_value_from_key(data, "sponsorCommentBadge") 38 | }, 39 | vote_count=safely_get_value_from_key(data, "voteCount") 40 | ) 41 | 42 | reply.raw_data = data 43 | 44 | return reply 45 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/requests_handler.py: -------------------------------------------------------------------------------- 1 | import os 2 | from requests_cache import CachedSession 3 | 4 | dirname = os.path.dirname(__file__) 5 | CACHE_FILE_PATH = os.path.join(dirname, "requests_cache.sqlite") 6 | 7 | requests_cache = CachedSession(allowable_methods=("GET", "POST"), cache_name=CACHE_FILE_PATH) 8 | -------------------------------------------------------------------------------- /youtube-community-tab/tests/test_actions.py: -------------------------------------------------------------------------------- 1 | from http import cookiejar 2 | from youtube_community_tab.requests_handler import requests_cache 3 | from youtube_community_tab.helpers import search_key 4 | from youtube_community_tab import Post 5 | import time 6 | 7 | EXPIRATION_TIME = 24 * 60 * 60 # requests cache expiration 8 | 9 | cookie_jar = cookiejar.MozillaCookieJar("./cookies.txt") 10 | cookie_jar.load() 11 | requests_cache.cookies = cookie_jar 12 | 13 | 14 | def test_actions(): 15 | post = Post.from_post_id("UgkxpAbrgRG3trNwPVu9ipY7vALkJ_Q-c1lv") 16 | comment = post.create_comment(f"[Current timestamp: {time.time()}]") 17 | 18 | assert comment is not None 19 | 20 | r = comment.set_like_comment() 21 | s = search_key("status", r) 22 | 23 | assert len(s) > 0 and s[0][1] == "STATUS_SUCCEEDED" 24 | 25 | r = comment.update_comment(f"[Edited][Current timestamp: {time.time()}]") 26 | s = search_key("status", r) 27 | 28 | assert len(s) > 0 and s[0][1] == "STATUS_SUCCEEDED" 29 | 30 | r = comment.set_dislike_comment() 31 | s = search_key("status", r) 32 | 33 | assert len(s) > 0 and s[0][1] == "STATUS_SUCCEEDED" 34 | 35 | r = comment.delete_comment() 36 | s = search_key("status", r) 37 | 38 | assert len(s) > 0 and s[0][1] == "STATUS_SUCCEEDED" 39 | 40 | 41 | if __name__ == "__main__": 42 | test_actions() 43 | -------------------------------------------------------------------------------- /youtube-community-tab/tests/test_community_tab.py: -------------------------------------------------------------------------------- 1 | from youtube_community_tab.community_tab import CommunityTab 2 | 3 | EXPIRATION_TIME = 24 * 60 * 60 # requests cache expiration 4 | 5 | 6 | def test_community_tab(): 7 | ct = CommunityTab("vsauce1") 8 | ct.load_posts(expire_after=EXPIRATION_TIME) 9 | 10 | num_posts = 

    num_posts = len(ct.posts)

    assert num_posts > 0
    assert ct.posts_continuation_token

    ct.load_posts(expire_after=EXPIRATION_TIME)
    num_posts_ = len(ct.posts)

    assert num_posts_ > num_posts

    post = ct.posts[-1]  # choose an old post to raise the probability of 'good' data
    post.load_comments(expire_after=EXPIRATION_TIME)

    num_comments = len(post.comments)

    assert num_comments > 0
    assert post.comments_continuation_token

    post.load_comments(expire_after=EXPIRATION_TIME)

    num_comments_ = len(post.comments)

    assert num_comments_ > num_comments

    replied_comments = list(filter(lambda x: x.replies_continuation_token, post.comments))

    if len(replied_comments) > 0:
        comment = replied_comments[0]

        comment.load_replies(expire_after=EXPIRATION_TIME)

        assert len(comment.replies) > 0


if __name__ == "__main__":
    test_community_tab()
--------------------------------------------------------------------------------
/youtube-community-tab/tests/test_membership.py:
--------------------------------------------------------------------------------
from http import cookiejar
from youtube_community_tab.requests_handler import requests_cache
from youtube_community_tab.community_tab import CommunityTab
from youtube_community_tab import Post

EXPIRATION_TIME = 24 * 60 * 60  # requests cache expiration

cookie_jar = cookiejar.MozillaCookieJar("./cookies.txt")
cookie_jar.load()
requests_cache.cookies = cookie_jar


def test_load_membership_posts():
    ct = CommunityTab("UCMwGHR0BTZuLsmjY_NT5Pwg")
    ct.load_posts(expire_after=EXPIRATION_TIME)

    membership_post = None
    while ct.posts_continuation_token:
        for post in ct.posts:
            if post.sponsor_only_badge is not None:
                membership_post = post
                break

        if membership_post is not None:
            break

        ct.load_posts(expire_after=EXPIRATION_TIME)

    assert membership_post is not None
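

# fetching a members-only post directly by ID (below) presumably succeeds only
# because the cookies loaded above carry an active membership for this channel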

def test_membership_post():
    post = Post.from_post_id("UgkxJYrBY-QqIt1ysrZY0ZP84SGJLWmDmtoU", expire_after=EXPIRATION_TIME)

    # This post can be edited, so this test can fail in the future
    post_text = post.get_text()

    expected_text = "Cheeeeeeeeeeeeeeeese\nAm I bored? I don't know.... nyeh 😺"

    assert post_text == expected_text

    post.load_comments(expire_after=EXPIRATION_TIME)
    num_comments = len(post.comments)

    assert num_comments > 0
    assert post.comments_continuation_token

    post.load_comments(expire_after=EXPIRATION_TIME)
    num_comments_ = len(post.comments)

    assert num_comments_ > num_comments

    replied_comments = list(filter(lambda x: x.replies_continuation_token, post.comments))

    if len(replied_comments) > 0:
        comment = replied_comments[0]

        comment.load_replies(expire_after=EXPIRATION_TIME)

        assert len(comment.replies) > 0


if __name__ == "__main__":
    test_load_membership_posts()
    test_membership_post()
--------------------------------------------------------------------------------
/youtube-community-tab/tests/test_post.py:
--------------------------------------------------------------------------------
from youtube_community_tab.post import Post

EXPIRATION_TIME = 24 * 60 * 60  # requests cache expiration


def test_post():
    post = Post.from_post_id("UgznJEQUR0fJzoMlS2Z4AaABCQ", expire_after=EXPIRATION_TIME)

    # This post can be edited, so this test can fail in the future
    post_text = post.get_text()
    expected_text = "Vsauce is 11 years old today!!!!"

    assert post_text == expected_text

    post.load_comments(expire_after=EXPIRATION_TIME)
    num_comments = len(post.comments)

    assert num_comments > 0
    assert post.comments_continuation_token

    post.load_comments(expire_after=EXPIRATION_TIME)
    num_comments_ = len(post.comments)

    assert num_comments_ > num_comments

    replied_comments = list(filter(lambda x: x.replies_continuation_token, post.comments))

    if len(replied_comments) > 0:
        comment = replied_comments[0]

        comment.load_replies(expire_after=EXPIRATION_TIME)

        assert len(comment.replies) > 0


if __name__ == "__main__":
    test_post()
--------------------------------------------------------------------------------
/ytct.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import argparse
from datetime import datetime
from http import cookiejar
import json
import os
import re
import requests
import sys
import urllib.parse as urlparse
from youtube_community_tab.requests_handler import requests_cache
from youtube_community_tab.post import Post
from youtube_community_tab.community_tab import CommunityTab

POST_REGEX = r"^(?:(?:https?:\/\/)?(?:.*?\.)?(?:youtube\.com\/)((?:channel\/UC[a-zA-Z0-9_-]+\/community\?lb=)|post\/))?(?P<post_id>Ug[a-zA-Z0-9_-]+)(.*)?$"
CHANNEL_REGEX = r"^(?:(?:https?:\/\/)?(?:.*?\.)?(?:youtube\.com\/))((?P<channel_handle>@[a-zA-Z0-9_-]+)|((channel\/)?(?P<channel_id>UC[a-zA-Z0-9_-]+)))(?:\/.*)?$"
HANDLE_TO_ID_REGEX = r"\"header\":\{\"c4TabbedHeaderRenderer\":\{\"channelId\":\"(?P<channel_id>UC[a-zA-Z0-9_-]+)\""
POST_DATE_REGEX = r"(?P<magnitude>[0-9]{1,2}) (?P<unit>(second|minute|hour|day|week|month|year))s? ago(?P<edited> \(edited\))?$"
CLEAN_FILENAME_KINDA = r"[^\w\-_\. \[\]\(\)]"
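
# illustrative inputs these patterns are meant to accept (hypothetical examples):
#   POST_REGEX:    "UgkxJYrBY-QqIt1ysrZY0ZP84SGJLWmDmtoU"
#                  "https://www.youtube.com/post/Ugkx..."
#                  "https://www.youtube.com/channel/UC.../community?lb=Ugkx..."
#   CHANNEL_REGEX: "https://www.youtube.com/@SomeHandle/community"
#                  "https://www.youtube.com/channel/UCMwGHR0BTZuLsmjY_NT5Pwg/community"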
BLOCK_SIZE = 1024
# "month" is intentionally absent: get_time_diff_from_text() special-cases it
TIME_FACTORS = {
    "second": 1,
    "minute": 60,
    "hour": 60 * 60,
    "day": 60 * 60 * 24,
    "week": 60 * 60 * 24 * 7,  # beyond 28 days YouTube switches to "1 month ago"
    "year": 60 * 60 * 24 * 365
}

args = None


def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cookies", metavar="COOKIES FILE", type=str, help="path to a Netscape format cookies file where cookies will be read from/written to")
    parser.add_argument("-d", "--directory", type=str, help="save directory (defaults to current)", default=os.getcwd())
    parser.add_argument("--post-archive", metavar="FILE", type=str, help="download only posts not listed in the archive file and record the IDs of newly downloaded posts")
    parser.add_argument("--dates", action="store_true", help="write information about the post publish date")
    parser.add_argument("-r", "--reverse", action="store_true", help="download posts from oldest to newest")
    parser.add_argument("links", metavar="CHANNEL", nargs="*", help="youtube channel or community post link/id")
    parser.add_argument("--skip-download", action="store_true", help="skip downloading posts, intended for only recording post IDs to the archive")
    return parser.parse_args()


def use_default_cookies():
    requests_cache.cookies.set(
        'SOCS',
        'CAESNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjIwNzA1LjE2X3AwGgJwdCACGgYIgOedlgY',
        domain='.youtube.com',
        path='/'
    )
    requests_cache.cookies.set(
        'CONSENT',
        'PENDING+917',
        domain='.youtube.com',
        path='/'
    )


def use_cookies(cookie_jar_path):
    cookie_jar = cookiejar.MozillaCookieJar(cookie_jar_path)
    try:
        cookie_jar.load()
        print_log("ytct", f"loaded cookies from {cookie_jar_path}")
    except FileNotFoundError:
        use_default_cookies()
        print_log("ytct", f"could not find cookies file {cookie_jar_path}, continuing without cookies...")
        return
    except (cookiejar.LoadError, OSError) as e:
        use_default_cookies()
        print_log("ytct", f"{e}")
        print_log("ytct", f"failed to load cookies from {cookie_jar_path}, continuing without cookies")
        return
    requests_cache.cookies = cookie_jar


def get_channel_id_from_handle(channel_handle):
    handle_url = f"https://youtube.com/{channel_handle}"
    channel_home_r = requests_cache.get(handle_url)
    if not channel_home_r.ok:
        print_log("ytct", f"failed to convert channel handle to channel id, no response from {handle_url}")
        sys.exit(1)
    channel_home = channel_home_r.text
    channel_id_m = re.search(HANDLE_TO_ID_REGEX, channel_home)
    if not channel_id_m:
        print_log("ytct", "failed to convert channel handle to channel id, data format may have changed")
        sys.exit(1)
    return channel_id_m.group("channel_id")


def get_post(post_id, post_archive):
    if post_archive:
        with open(post_archive, "r") as archive_file:
            skip_ids = archive_file.read().splitlines()
        if post_id in skip_ids:
            print_log(f"post:{post_id}", "already recorded in archive")
            return
    post = Post.from_post_id(post_id)
    handle_post(post)
    if post_archive:
        with open(post_archive, "a") as archive_file:
            archive_file.write(f"{post_id}\n")
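

# the archive file is plain text with one post ID per line; an ID is appended
# only after the post has been handled, so interrupted runs can be resumed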


def get_channel_posts(channel_id, post_archive):
    ct = CommunityTab(channel_id)
    page_count = 1
    print_log("community tab", f"getting posts from community tab (page {page_count})", "\r")
    ct.load_posts(0)
    while ct.posts_continuation_token:
        page_count += 1
        print_log("community tab", f"getting posts from community tab (page {page_count})", "\r")
        ct.load_posts(0)
    print_log("community tab", f"getting posts from community tab (page {page_count})")
    print_log("community tab", f"found {len(ct.posts)} posts")
    # only read the archive once
    skip_ids = []
    if post_archive:
        with open(post_archive, "r") as archive_file:
            skip_ids = archive_file.read().splitlines()
    if args.reverse:
        ct.posts = list(reversed(ct.posts))
    for post in ct.posts:
        if len(skip_ids) > 0 and post.post_id in skip_ids:
            print_log(f"post:{post.post_id}", "already recorded in archive")
            continue
        if not args.skip_download:
            handle_post(post)
        if post_archive:
            with open(post_archive, "a") as archive_file:
                archive_file.write(f"{post.post_id}\n")


def handle_post(post):
    post_j = post.as_json()
    if post.original_post is not None:
        if args.dates:
            post_j["original_post"]["_published"] = get_timestamp_metadata(post.original_post)
        handle_post(post.original_post)
    component = f"post:{post.post_id}"
    post_file_name = f"{post.post_id}"
    post_file_dir = args.directory
    post_file_path = os.path.join(post_file_dir, post_file_name)
    if args.dates:
        timestamp_info = get_timestamp_metadata(post)
        post_j["_published"] = timestamp_info
    try:
        if not os.path.isdir(post_file_dir):
            os.makedirs(post_file_dir)
        if os.path.isfile(f"{post_file_path}.json.tmp"):
            os.remove(f"{post_file_path}.json.tmp")
        print_log(component, f"writing {post_file_name}.json")
        with open(f"{post_file_path}.json.tmp", "w", encoding='utf8') as post_file:
            post_file.write(json.dumps(post_j, ensure_ascii=False))
        if os.path.isfile(f"{post_file_path}.json"):
            os.remove(f"{post_file_path}.json")
        os.rename(f"{post_file_path}.json.tmp", f"{post_file_path}.json")
    except Exception as e:
        print_log(component, f"failed to write file {post_file_path}")
        print_log(component, str(e))
    if post.backstage_attachment:
        handle_post_attachments(component, post.backstage_attachment, post_file_path)


def get_timestamp_metadata(post):
    timestamp_obj = {}
    # last updated time
    timestamp_obj["lastUpdatedTimestamp"] = int(datetime.utcnow().timestamp())
    # string as it appears on YouTube
    timestamp_obj["lastPublishedString"] = post.get_published_string()
    return timestamp_obj
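

# Worked example of the accuracy bookkeeping sketched in the disabled code
# below (illustrative numbers): a post first saved while YouTube shows
# "3 days ago" is known to be 72-96 hours old. If a later run happens to catch
# the moment the label flips to "4 days ago", the gap between the two runs
# bounds the publish time more tightly than the original 24-hour window.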


def handle_post_timestamp(post, path):
    timestamp_obj = get_timestamp_metadata(post)
    # code removed for the time being to prevent trashing files of inexperienced users
    # the closest UTC timestamp, and the seconds difference from the furthest UTC timestamp
    # diff_to_nearest_possible_date, timestamp_obj["timestampAccuracy"], timestamp_obj["is_edited"] = get_time_diff_from_text(timestamp_obj["lastPublishedString"])
    # if diff_to_nearest_possible_date and timestamp_obj["timestampAccuracy"]:
    #     timestamp_obj["closestTimestamp"] = timestamp_obj["lastUpdatedTimestamp"] - diff_to_nearest_possible_date
    # if os.path.isfile(f"{path}.json"):
    #     try:
    #         with open(f"{path}.json", "r") as previous_post_file:
    #             previous_post_j = json.load(previous_post_file)
    #         if "_published" in previous_post_j:
    #             previous_timestamp_obj = previous_post_j["_published"]
    #             diff_since_last_update = timestamp_obj["lastUpdatedTimestamp"] - previous_timestamp_obj["lastUpdatedTimestamp"]
    #             if previous_timestamp_obj["lastPublishedString"] == timestamp_obj["lastPublishedString"]:
    #                 # update accuracy based on time between current and last update
    #                 timestamp_obj["timestampAccuracy"] = previous_timestamp_obj["timestampAccuracy"] - diff_since_last_update
    #             elif diff_since_last_update < previous_timestamp_obj["timestampAccuracy"]:
    #                 # time between change in update is less than previous accuracy, should be safe to change
    #                 # i.e. if you save a post 3 days after publish, accuracy is 72-96 hours
    #                 # if you then update 364 days after publish, and update again 1 year after publish
    #                 # the diff since last update is 24 hours, which is better than before
    #                 timestamp_obj["timestampAccuracy"] = diff_since_last_update
    #             else:
    #                 # keep previous accuracy
    #                 timestamp_obj["timestampAccuracy"] = previous_timestamp_obj["timestampAccuracy"]
    #             if previous_timestamp_obj["closestTimestamp"] < timestamp_obj["closestTimestamp"]:
    #                 # if closest timestamp is not better than previous, keep previous
    #                 timestamp_obj["closestTimestamp"] = previous_timestamp_obj["closestTimestamp"]
    #     except Exception as e:
    #         print_log("community post", f"failed to open previously downloaded post {post.post_id}")
    #         print_log("community post", str(e))


def get_time_diff_from_text(published_text):
    post_date_m = re.search(POST_DATE_REGEX, published_text)
    if post_date_m:
        mag = int(post_date_m.group("magnitude"))
        unit = post_date_m.group("unit")
        delta_secs = 0
        accuracy = 0
        if unit == "month":
            # absolute madness beyond this point
            delta_secs += TIME_FACTORS["day"] * 28
            if mag != 1:
                delta_secs += (mag - 1) * TIME_FACTORS["day"] * 30.4
            accuracy = TIME_FACTORS["day"] * 31 - 1
        else:
            delta_secs = mag * TIME_FACTORS[unit]
            accuracy = TIME_FACTORS[unit] - 1
        edited = False
        if post_date_m.group("edited"):
            edited = True
        return (delta_secs, accuracy, edited)
    else:
        print_log("community post:date", f"could not parse '{published_text}', open an issue?")
        return (None, None, None)
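

# illustrative outputs (not exercised by the script itself):
#   get_time_diff_from_text("3 days ago")           -> (259200, 86399, False)
#   get_time_diff_from_text("1 month ago (edited)") -> (2419200, 2678399, True)
# i.e. "1 month" is anchored at the 28-day flip-over, with a worst-case
# accuracy window of just under 31 days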


def handle_post_attachments(component, attachment, path):
    if "postMultiImageRenderer" in attachment:
        num_images = len(attachment["postMultiImageRenderer"]["images"])
        print_log(component, f"downloading {num_images} attached images")
        for image_i in range(num_images):
            handle_post_attachments(component, attachment["postMultiImageRenderer"]["images"][image_i], f"{path}_{image_i}")
    elif "backstageImageRenderer" in attachment:
        print_log(component, "downloading image")
        image_url = attachment["backstageImageRenderer"]["image"]["thumbnails"][-1]["url"].split("=", 1)[0] + "=s0?imgmax=0"
        image_r = requests.get(image_url, stream=True, allow_redirects=True)
        image_ext = image_r.headers["Content-Type"].split("/", 1)[1].replace("jpeg", "jpg")
        image_path = f"{path}.{image_ext}"
        if not os.path.isfile(image_path):
            if os.path.isfile(f"{image_path}.tmp"):
                os.remove(f"{image_path}.tmp")
            with open(f"{image_path}.tmp", "wb") as image_file:
                for chunk in image_r.iter_content(BLOCK_SIZE):
                    image_file.write(chunk)
            os.rename(f"{image_path}.tmp", image_path)
        else:
            print_log(component, "image already downloaded, skipping")
    elif "videoRenderer" in attachment:
        thumb_url = None
        if "videoId" in attachment["videoRenderer"]:
            video_id = attachment["videoRenderer"]["videoId"]
            thumb_url = f"https://i.ytimg.com/vi/{video_id}/maxresdefault.jpg"
        elif "thumbnail" in attachment["videoRenderer"]:
            thumb_url = urlparse.urljoin(attachment["videoRenderer"]["thumbnail"]["thumbnails"][-1]["url"], "maxresdefault.jpg")
            print_log(component, "could not get video ID, video may be private or deleted")
        if thumb_url:
            print_log(component, "downloading thumbnail")
            thumb_r = requests.get(thumb_url, stream=True, allow_redirects=True)
            thumb_ext = thumb_r.headers["Content-Type"].split("/", 1)[1].replace("jpeg", "jpg")
            thumb_path = f"{path}_thumb.{thumb_ext}"
            if not os.path.isfile(thumb_path):
                if os.path.isfile(f"{thumb_path}.tmp"):
                    os.remove(f"{thumb_path}.tmp")
                with open(f"{thumb_path}.tmp", "wb") as thumb_file:
                    for chunk in thumb_r.iter_content(BLOCK_SIZE):
                        thumb_file.write(chunk)
                os.rename(f"{thumb_path}.tmp", thumb_path)
            else:
                print_log(component, "thumbnail already downloaded, skipping")
        else:
            print_log(component, "could not get video thumbnail url for post")


def clean_name(text):
    return re.sub(CLEAN_FILENAME_KINDA, "_", text)


def print_log(component, message, end="\n"):
    print(f"[{component}] {message}", end=end)


if __name__ == "__main__":
    args = get_arguments()
    # set cookies for retrieving posts that need auth
    if args.cookies:
        use_cookies(args.cookies)
    else:
        use_default_cookies()
    usable_archive = None
    if args.post_archive:
        # make sure the directory of the archive file exists, creating it if necessary
        log_path = os.path.dirname(args.post_archive)
        if log_path and not os.path.isdir(log_path):
            try:
                os.makedirs(log_path)
            except OSError:
                print_log("ytct", "failed to create log directory")

        try:
            with open(args.post_archive, "a"):
                pass
            usable_archive = args.post_archive
        except OSError:
            print_log("ytct", f"cannot write to the archive file {args.post_archive}, continuing...")
    if not os.path.isdir(args.directory):
        try:
            os.makedirs(args.directory)
        except OSError:
            print_log("ytct", "failed to create output directory")
            sys.exit(1)
    for link in args.links:
        post_id_m = re.search(POST_REGEX, link)
        channel_id_m = re.search(CHANNEL_REGEX, link)
        if post_id_m:
            post_id = post_id_m.group("post_id")
            get_post(post_id, usable_archive)
        elif channel_id_m:
            channel_handle = channel_id_m.group("channel_handle")
            if channel_handle:
                channel_id = get_channel_id_from_handle(channel_handle)
            else:
                channel_id = channel_id_m.group("channel_id")
            get_channel_posts(channel_id, usable_archive)
        else:
            print_log("ytct", f"could not parse link/id {link}")
    print_log("ytct", "finished")
--------------------------------------------------------------------------------