├── readme.md ├── youtube-community-tab ├── .flake8 ├── .gitignore ├── LICENSE ├── README.md ├── pyproject.toml ├── setup.cfg ├── setup.py ├── src │ └── youtube_community_tab │ │ ├── __init__.py │ │ ├── comment.py │ │ ├── community_tab.py │ │ ├── helpers │ │ ├── __init__.py │ │ ├── clean_items.py │ │ └── utils.py │ │ ├── post.py │ │ ├── reply.py │ │ └── requests_handler.py └── tests │ ├── test_actions.py │ ├── test_community_tab.py │ ├── test_membership.py │ └── test_post.py └── ytct.py /readme.md: -------------------------------------------------------------------------------- 1 | # YouTube Community Tab 2 | 3 | This repo includes a fork of [bot-jonas/youtube-community-tab](https://github.com/bot-jonas/youtube-community-tab), as well as a script to scrape and dump community tab posts as `.json` files, along with all attached images and thumbnails. 4 | 5 | ## Setup / Update 6 | 7 | Since this version of the youtube-community-tab package is slightly modified, you will need to install/update it from this repo to guarantee compatibility. 8 | ```sh 9 | cd youtube-community-tab 10 | pip install . 11 | ``` 12 | 13 | ## Example Usage 14 | 15 | ```sh 16 | python ytct.py --cookies cookies-youtube-com.txt -d "./Ninomae Ina_nis Ch. hololive-EN" https://www.youtube.com/channel/UCMwGHR0BTZuLsmjY_NT5Pwg/community 17 | # or 18 | ./ytct.py --cookies cookies-youtube-com.txt -d "./Ninomae Ina_nis Ch. hololive-EN" https://www.youtube.com/@NinomaeInanis/community 19 | ``` 20 | 21 | ## Arguments 22 | 23 | ``` 24 | -h, --help show this help message and exit 25 | --cookies COOKIES FILE a Netscape-format cookies file; allows the script to 26 | retrieve membership-only posts 27 | -d, --directory DIRECTORY save directory (defaults to current) 28 | --post-archive FILE download only posts not listed in the archive file 29 | and record the IDs of newly downloaded posts 30 | --dates write information about the post publish date 31 | -r, --reverse set download order from oldest to newest post 32 | ```
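33 | 34 | For example, an incremental run that skips posts already recorded in an archive file and also writes publish-date info could combine these flags (the archive filename here is just illustrative): 35 | 36 | ```sh 37 | ./ytct.py --cookies cookies-youtube-com.txt --post-archive archive.txt --dates -d "./Ninomae Ina_nis Ch. hololive-EN" https://www.youtube.com/@NinomaeInanis/community 38 | ```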
39 | -------------------------------------------------------------------------------- /youtube-community-tab/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 200 3 | max-complexity = 18 -------------------------------------------------------------------------------- /youtube-community-tab/.gitignore: -------------------------------------------------------------------------------- 1 | */__pycache__/* 2 | */*.egg-info/* 3 | *.sqlite 4 | build/* 5 | dist/* 6 | tests/cookies.txt 7 | -------------------------------------------------------------------------------- /youtube-community-tab/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Jonas Alves 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /youtube-community-tab/README.md: -------------------------------------------------------------------------------- 1 | # youtube_community_tab 2 | 3 | A Python 3 interface to the YouTube community tab. It handles posts, comments, and comment replies. 4 | 5 | This is a fork of [bot-jonas/youtube-community-tab](https://github.com/bot-jonas/youtube-community-tab) that aims to return more comprehensive JSON objects and to support cookies for membership-limited posts. 6 | 7 | ## Community Tab 8 | 9 | ```python 10 | from youtube_community_tab.community_tab import CommunityTab 11 | import json 12 | 13 | 14 | def indent_print(text, level=1): 15 | indent = level * "\t" 16 | print(indent + ("\n" + indent).join(text.split("\n"))) 17 | 18 | 19 | # Cache expiration 20 | EXPIRATION_TIME = 1 * 60 * 60 21 | 22 | ct = CommunityTab("vsauce1") 23 | 24 | # Load initial posts 25 | ct.load_posts(expire_after=EXPIRATION_TIME) 26 | 27 | # Load more posts 28 | while(ct.posts_continuation_token and len(ct.posts) < 40): 29 | ct.load_posts(expire_after=EXPIRATION_TIME) 30 | 31 | post = ct.posts[0] 32 | print(f"[Post {post.post_id}]") 33 | indent_print(post.get_text()) 34 | 35 | print("\n[Thumbnails]") 36 | print(json.dumps(post.get_thumbnails()[0], indent=4)) 37 | 38 | # Load initial comments 39 | post.load_comments(expire_after=EXPIRATION_TIME) 40 | 41 | # Load more comments 42 | while(post.comments_continuation_token and len(post.comments) < 100): 43 | post.load_comments(expire_after=EXPIRATION_TIME) 44 | 45 | comment = post.comments[1] 46 | print(f"\n[Comment {comment.comment_id}]") 47 | indent_print(comment.get_text()) 48 | 49 | # Load initial comment replies 50 | comment.load_replies(expire_after=EXPIRATION_TIME) 51 | 52 | # Load more comment replies 53 | while(comment.replies_continuation_token and len(comment.replies) < 10): 54 | comment.load_replies(expire_after=EXPIRATION_TIME) 55 | 56 | reply = comment.replies[0] 57 | print(f"\n[Reply {reply.reply_id}]") 58 | indent_print(reply.get_text()) 59 | 60 | ``` 61 | 62 | Output: 63 | 64 | ``` 65 | [Post UgkxzeM19x_He9LEoerdLOHwZJsqIwamUnTj] 66 | THANK YOU! 67 | 68 | WE RAISED $20,180 for the Alzheimer's Association!!! 69 | The winner of this beautiful cube of my beard hairs will be announced November 15th!! 70 | 71 | As you all know, we also donate a portion of all proceeds from the Curiosity Box to Alzheimer's research; and there's never been a better time to do a favor for your brain and everyone else's: 72 | 73 | RIGHT NOW: subscribe with code "BEST" and I'll send you our newest box *and* throw in our BEST-OF BOX completely FREE!!!
74 | 75 | 76 | https://www.curiositybox.com 77 | 78 | [Thumbnails] 79 | [ 80 | { 81 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s288-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 82 | "width": 288, 83 | "height": 288 84 | }, 85 | { 86 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s400-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 87 | "width": 400, 88 | "height": 400 89 | }, 90 | { 91 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s462-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 92 | "width": 462, 93 | "height": 462 94 | } 95 | ] 96 | 97 | [Comment UgyTIomDXMuKf3NTo294AaABAg] 98 | Thank you for doing this. Both my grandparents are affected by alzheimer's disease. It is difficult to watch a highly creative woman and an electrical engineer fade away. 99 | 100 | [Reply UgyTIomDXMuKf3NTo294AaABAg.9TtQ3j7qvll9TtqSmVNrJu] 101 | Hey a heart 102 | ``` 103 | 104 | ## Post 105 | 106 | ```python 107 | from youtube_community_tab.post import Post 108 | import json 109 | 110 | 111 | def indent_print(text, level=1): 112 | indent = level * "\t" 113 | print(indent + ("\n" + indent).join(text.split("\n"))) 114 | 115 | 116 | # Cache expiration 117 | EXPIRATION_TIME = 1 * 60 * 60 118 | 119 | post = Post.from_post_id("UgkxzeM19x_He9LEoerdLOHwZJsqIwamUnTj") 120 | print(f"[Post {post.post_id}]") 121 | indent_print(post.get_text()) 122 | 123 | print("\n[Thumbnails]") 124 | print(json.dumps(post.get_thumbnails()[0], indent=4)) 125 | 126 | # Load initial comments 127 | post.load_comments(expire_after=EXPIRATION_TIME) 128 | 129 | # Load more comments 130 | while(post.comments_continuation_token and len(post.comments) < 100): 131 | post.load_comments(expire_after=EXPIRATION_TIME) 132 | 133 | comment = post.comments[1] 134 | print(f"\n[Comment {comment.comment_id}]") 135 | indent_print(comment.get_text()) 136 | 137 | # Load initial comment replies 138 | comment.load_replies(expire_after=EXPIRATION_TIME) 139 | 140 | # Load more comment replies 141 | while(comment.replies_continuation_token and len(comment.replies) < 10): 142 | comment.load_replies(expire_after=EXPIRATION_TIME) 143 | 144 | reply = comment.replies[0] 145 | print(f"\n[Reply {reply.reply_id}]") 146 | indent_print(reply.get_text()) 147 | 148 | ``` 149 | 150 | Output: 151 | ``` 152 | [Post UgkxzeM19x_He9LEoerdLOHwZJsqIwamUnTj] 153 | THANK YOU! 154 | 155 | WE RAISED $20,180 for the Alzheimer's Association!!! 156 | The winner of this beautiful cube of my beard hairs will be announced November 15th!! 157 | 158 | As you all know, we also donate a portion of all proceeds from the Curiosity Box to Alzheimer's research; and there's never been a better time to do a favor for your brain and everyone else's: 159 | 160 | RIGHT NOW: subscribe with code "BEST" and I'll send you our newest box *and* throw in our BEST-OF BOX completely FREE!!! 
161 | 162 | 163 | https://www.curiositybox.com 164 | 165 | [Thumbnails] 166 | [ 167 | { 168 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s288-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 169 | "width": 288, 170 | "height": 288 171 | }, 172 | { 173 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s400-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 174 | "width": 400, 175 | "height": 400 176 | }, 177 | { 178 | "url": "https://yt3.ggpht.com/DJhBHUy1SyM2XpjC1ObZyrt8llJ-qG6svLapmaZgU-wmo5rVnWR93kJMrtz85XI9EKSt395Cvziu-JE=s462-c-fcrop64=1,1e6d0000e38bffff-nd-v1", 179 | "width": 462, 180 | "height": 462 181 | } 182 | ] 183 | 184 | [Comment UgyTIomDXMuKf3NTo294AaABAg] 185 | Thank you for doing this. Both my grandparents are affected by alzheimer's disease. It is difficult to watch a highly creative woman and an electrical engineer fade away. 186 | 187 | [Reply UgyTIomDXMuKf3NTo294AaABAg.9TtQ3j7qvll9TtqSmVNrJu] 188 | Hey a heart 189 | ``` 190 | 191 | ## Authentication/Membership 192 | 193 | To access restricted posts, such as membership-only posts, you need to provide cookies to authenticate your requests. 194 | 195 | ```python 196 | from http import cookiejar 197 | from youtube_community_tab.requests_handler import requests_cache 198 | from youtube_community_tab.community_tab import CommunityTab 199 | 200 | # Cache expiration 201 | EXPIRATION_TIME = 1 * 60 * 60 202 | 203 | cookie_jar = cookiejar.MozillaCookieJar("cookies.txt") 204 | cookie_jar.load() 205 | requests_cache.cookies = cookie_jar 206 | 207 | ct = CommunityTab("UCMwGHR0BTZuLsmjY_NT5Pwg") 208 | ct.load_posts() 209 | 210 | membership_post = None 211 | while ct.posts_continuation_token: 212 | for post in ct.posts: 213 | if post.sponsor_only_badge is not None: 214 | membership_post = post 215 | break 216 | 217 | if(membership_post is not None): 218 | break 219 | 220 | ct.load_posts(expire_after=EXPIRATION_TIME) 221 | 222 | assert(membership_post is not None) 223 | ```
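224 | 225 | Note that requests are only signed with a `SAPISIDHASH` `Authorization` header when a `SAPISID` cookie is present in the jar (this mirrors the check done throughout the request code), so it is worth sanity-checking the cookies file right after loading it. A minimal sketch, reusing the session from the example above: 226 | 227 | ```python 228 | from requests.utils import dict_from_cookiejar 229 | 230 | # If SAPISID is missing, requests go out unauthenticated and membership-only posts stay hidden 231 | assert "SAPISID" in dict_from_cookiejar(requests_cache.cookies) 232 | ```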
"requests_cache", 19 | ], 20 | packages=find_packages(where="src"), 21 | zip_safe=False, 22 | ) 23 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/__init__.py: -------------------------------------------------------------------------------- 1 | from . import ( 2 | helpers 3 | ) 4 | 5 | from .comment import Comment 6 | from .community_tab import CommunityTab 7 | from .post import Post 8 | from .reply import Reply 9 | from .requests_handler import requests_cache 10 | 11 | __all__ = [ 12 | "helpers", 13 | "Comment", 14 | "CommunityTab", 15 | "Post", 16 | "Reply", 17 | "requests_cache" 18 | ] 19 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/comment.py: -------------------------------------------------------------------------------- 1 | import json 2 | from requests.utils import dict_from_cookiejar 3 | from base64 import urlsafe_b64encode 4 | 5 | from .requests_handler import requests_cache 6 | from .helpers.utils import safely_get_value_from_key, get_auth_header, CLIENT_VERSION 7 | from .reply import Reply 8 | 9 | 10 | class Comment(object): 11 | FORMAT_URLS = { 12 | "POST": "https://www.youtube.com/post/{}", 13 | # HARD_CODED: This key seems to be constant to everyone, IDK 14 | "BROWSE_ENDPOINT": "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", 15 | "UPDATE_COMMENT_ENDPOINT": "https://www.youtube.com/youtubei/v1/comment/update_comment?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false", 16 | "PERFORM_COMMENT_ACTION_ENDPOINT": "https://www.youtube.com/youtubei/v1/comment/perform_comment_action?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false", 17 | "FIXED_COMMENT": "https://www.youtube.com/channel/{}/community?lc={}&lb={}", 18 | } 19 | 20 | def __init__( 21 | self, 22 | post_id, 23 | comment_id, 24 | channel_id=None, 25 | author=None, 26 | content_text=None, 27 | vote_count=None, 28 | replies_continuation_token=None, 29 | click_tracking_params=None, 30 | visitor_data=None, 31 | session_index="0" 32 | ): 33 | self.post_id = post_id 34 | self.comment_id = comment_id 35 | self.channel_id = channel_id 36 | self.author = author 37 | self.content_text = content_text 38 | self.vote_count = vote_count 39 | self.replies_continuation_token = replies_continuation_token 40 | self.click_tracking_params = click_tracking_params 41 | self.visitor_data = visitor_data 42 | self.session_index = session_index 43 | self.replies = [] 44 | 45 | def as_json(self): 46 | return { 47 | "comment_id": self.comment_id, 48 | "post_id": self.post_id, 49 | "channel_id": self.channel_id, 50 | "author": self.author, 51 | "content_text": self.content_text, 52 | "vote_count": self.vote_count 53 | } 54 | 55 | def __str__(self): 56 | return json.dumps(self.as_json(), indent=4) 57 | 58 | def __repr__(self): 59 | return self.__str__() 60 | 61 | def get_text(self): 62 | if self.content_text is not None: 63 | return "".join([run["text"] for run in self.content_text["runs"]]) 64 | return None 65 | 66 | def load_replies(self, expire_after=0): 67 | headers = { 68 | "Accept-Language": "en-US,en;q=0.9", 69 | "x-origin": "https://www.youtube.com", 70 | "Referer": Comment.FORMAT_URLS["POST"].format(self.post_id) 71 | } 72 | 73 | # Add authorization header 74 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 75 | if "SAPISID" in current_cookies: 76 | headers["Authorization"] = 
get_auth_header(current_cookies["SAPISID"]) 77 | 78 | if self.replies_continuation_token: 79 | headers.update( 80 | { 81 | "X-Goog-AuthUser": self.session_index, 82 | "X-Origin": "https://www.youtube.com", 83 | "X-Youtube-Client-Name": "1", 84 | "X-Youtube-Client-Version": CLIENT_VERSION 85 | } 86 | ) 87 | 88 | json_body = { 89 | "context": { 90 | "client": { 91 | "clientName": "WEB", 92 | "clientVersion": CLIENT_VERSION, 93 | "originalUrl": Comment.FORMAT_URLS["POST"].format(self.post_id), 94 | "visitorData": self.visitor_data 95 | } 96 | }, 97 | "continuation": self.replies_continuation_token, 98 | "clickTracking": {"clickTrackingParams": self.click_tracking_params} 99 | } 100 | 101 | r = requests_cache.post(Comment.FORMAT_URLS["BROWSE_ENDPOINT"], json=json_body, expire_after=expire_after, headers=headers) 102 | 103 | data = r.json() 104 | append = data["onResponseReceivedEndpoints"][0]["appendContinuationItemsAction"] 105 | self.click_tracking_params = data["trackingParams"] 106 | continuation_items = safely_get_value_from_key(append, "continuationItems", default=[]) 107 | 108 | self.append_replies_from_items(continuation_items) 109 | 110 | def append_replies_from_items(self, items): 111 | there_is_no_continuation_token = True 112 | for item in items: 113 | kind = list(item.keys())[0] 114 | 115 | if kind == "commentRenderer": 116 | self.replies.append(Reply.from_data(item[kind])) 117 | elif kind == "continuationItemRenderer": 118 | if "continuationEndpoint" in item[kind]: 119 | self.replies_continuation_token = item[kind]["continuationEndpoint"]["continuationCommand"]["token"] 120 | there_is_no_continuation_token = False 121 | elif "button" in item[kind]: 122 | self.replies_continuation_token = item[kind]["button"]["buttonRenderer"]["command"]["continuationCommand"]["token"] 123 | there_is_no_continuation_token = False 124 | 125 | if there_is_no_continuation_token: 126 | self.replies_continuation_token = False 127 | 128 | @staticmethod 129 | def from_data(data, post_id, channel_id, replies_continuation_token, click_tracking_params, visitor_data, session_index): 130 | comment = Comment( 131 | post_id, 132 | data["commentId"], 133 | channel_id=channel_id, 134 | content_text=safely_get_value_from_key(data, "contentText"), 135 | author={ 136 | "authorText": safely_get_value_from_key(data, "authorText"), 137 | "authorThumbnail": safely_get_value_from_key(data, "authorThumbnail"), 138 | "authorEndpoint": safely_get_value_from_key(data, "authorEndpoint", "browseEndpoint"), 139 | "authorIsChannelOwner": safely_get_value_from_key(data, "authorIsChannelOwner"), 140 | "sponsorCommentBadge": safely_get_value_from_key(data, "sponsorCommentBadge"), 141 | }, 142 | vote_count=safely_get_value_from_key(data, "voteCount"), 143 | replies_continuation_token=replies_continuation_token, 144 | click_tracking_params=click_tracking_params, 145 | visitor_data=visitor_data, 146 | session_index=session_index 147 | ) 148 | 149 | comment.raw_data = data 150 | 151 | return comment 152 | 153 | @staticmethod 154 | def get_fixed_comment_params(comment_id, post_id, channel_id): 155 | part1 = [ 156 | b"\x12\tcommunity\xB8\x01\x00\xCA\x01", 157 | (32 + len(post_id)).to_bytes(1, "big"), 158 | b"\x82\x01", 159 | len(comment_id).to_bytes(1, "big"), 160 | comment_id.encode(), 161 | b"\xB2\x01", 162 | len(post_id).to_bytes(1, "big"), 163 | post_id.encode(), 164 | b"\xEA\x02\x04\x10\x01\x18\x01\xAA\x03", 165 | (84 + len(post_id)).to_bytes(1, "big"), 166 | b"\x22", 167 | (64 + len(post_id)).to_bytes(1, "big"), 168 | 
b"0\x00\x82\x01", 169 | len(comment_id).to_bytes(1, "big"), 170 | comment_id.encode(), 171 | b"\xD8\x01\x01\xEA\x01", 172 | len(post_id).to_bytes(1, "big"), 173 | post_id.encode(), 174 | b"\xF2\x01", 175 | len(channel_id).to_bytes(1, "big"), 176 | channel_id.encode(), 177 | b"B\x10comments-section" 178 | ] 179 | 180 | part1 = urlsafe_b64encode(b"".join(part1)).replace(b"=", b"%3D") 181 | 182 | params = [ 183 | b"\xe2\xa9\x85\xb2\x02", 184 | (83 + 3 * len(post_id)).to_bytes(1, "big"), 185 | b"\x02\x12", 186 | len(channel_id).to_bytes(1, "big"), 187 | channel_id.encode(), 188 | b"\x1A", 189 | (54 + 3 * len(post_id)).to_bytes(1, "big"), 190 | b"\x02", 191 | part1 192 | ] 193 | 194 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 195 | 196 | return params 197 | 198 | @staticmethod 199 | def from_ids(comment_id, post_id, channel_id, expire_after=0): 200 | fixed_comment_url = Comment.FORMAT_URLS["FIXED_COMMENT"].format(channel_id, comment_id, post_id) 201 | headers = { 202 | "Accept-Language": "en-US,en;q=0.9", 203 | "x-origin": "https://www.youtube.com", 204 | "Referer": fixed_comment_url 205 | } 206 | 207 | # Add authorization header 208 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 209 | if "SAPISID" in current_cookies: 210 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 211 | 212 | c = Comment.get_fixed_comment_params(comment_id, post_id, channel_id) 213 | 214 | json_body = { 215 | "context": { 216 | "client": { 217 | "clientName": "WEB", 218 | "clientVersion": CLIENT_VERSION, 219 | "originalUrl": fixed_comment_url, 220 | } 221 | }, 222 | "continuation": c 223 | } 224 | 225 | r = requests_cache.post(Comment.FORMAT_URLS["BROWSE_ENDPOINT"], json=json_body, expire_after=expire_after, headers=headers) 226 | 227 | comment_data = safely_get_value_from_key( 228 | r.json(), "onResponseReceivedEndpoints", 1, "reloadContinuationItemsCommand", "continuationItems", 0, "commentThreadRenderer" 229 | ) 230 | 231 | if comment_data is not None: 232 | return Comment.from_data( 233 | comment_data["comment"]["commentRenderer"], 234 | post_id, 235 | channel_id, 236 | safely_get_value_from_key( 237 | comment_data, 238 | "replies", 239 | "commentRepliesRenderer", 240 | "contents", 241 | 0, 242 | "continuationItemRenderer", 243 | "continuationEndpoint", 244 | "continuationCommand", 245 | "token" 246 | ), 247 | safely_get_value_from_key( 248 | comment_data, 249 | "replies", 250 | "commentRepliesRenderer", 251 | "contents", 252 | 0, 253 | "continuationItemRenderer", 254 | "continuationEndpoint", 255 | "clickTrackingParams" 256 | ), 257 | None, 258 | None 259 | ) 260 | 261 | @staticmethod 262 | def get_update_comment_params(comment_id, post_id, channel_id): 263 | params = [ 264 | b"\n", 265 | len(comment_id).to_bytes(1, "big"), 266 | comment_id.encode(), 267 | b"*\x02\b\x00@\x01R", 268 | len(post_id).to_bytes(1, "big"), 269 | post_id.encode(), 270 | b"Z", 271 | len(channel_id).to_bytes(1, "big"), 272 | channel_id.encode() 273 | ] 274 | 275 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 276 | 277 | return params 278 | 279 | def update_comment(self, comment_text): 280 | return Comment._update_comment(comment_text, comment_id=self.comment_id, post_id=self.post_id, channel_id=self.channel_id) 281 | 282 | @staticmethod 283 | def _update_comment(comment_text, update_comment_params=None, comment_id=None, post_id=None, channel_id=None): 284 | if update_comment_params is None: 285 | update_comment_params = 
Comment.get_update_comment_params(comment_id, post_id, channel_id) 286 | 287 | headers = { 288 | "Accept-Language": "en-US,en;q=0.9", 289 | "x-origin": "https://www.youtube.com" 290 | } 291 | 292 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 293 | if "SAPISID" in current_cookies: 294 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 295 | 296 | json_body = { 297 | "context": { 298 | "client": { 299 | "clientName": "WEB", 300 | "clientVersion": CLIENT_VERSION 301 | } 302 | }, 303 | "updateCommentParams": update_comment_params, 304 | "commentText": comment_text 305 | } 306 | 307 | r = requests_cache.post( 308 | Comment.FORMAT_URLS["UPDATE_COMMENT_ENDPOINT"], 309 | json=json_body, 310 | headers=headers 311 | ) 312 | 313 | return r.json() 314 | 315 | @staticmethod 316 | def get_delete_comment_params(comment_id, post_id, channel_id): 317 | params = [ 318 | b"\b\x06\x10\x07\x1A", 319 | len(comment_id).to_bytes(1, "big"), 320 | comment_id.encode(), 321 | b"0\x00J\x15115587043600121621724P\x00\xA8\x01\x01\xB2\x01", 322 | len(post_id).to_bytes(1, "big"), 323 | post_id.encode(), 324 | b"\xBA\x01", 325 | len(channel_id).to_bytes(1, "big"), 326 | channel_id.encode(), 327 | b"\xF0\x01\x01" 328 | ] 329 | 330 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 331 | 332 | return params 333 | 334 | def delete_comment(self): 335 | return Comment._delete_comment(comment_id=self.comment_id, post_id=self.post_id, channel_id=self.channel_id) 336 | 337 | @staticmethod 338 | def _delete_comment(delete_comment_params=None, comment_id=None, post_id=None, channel_id=None): 339 | if delete_comment_params is None: 340 | delete_comment_params = Comment.get_delete_comment_params(comment_id, post_id, channel_id) 341 | 342 | return Comment.perform_action(delete_comment_params) 343 | 344 | @staticmethod 345 | def get_dislike_comment_params(value, comment_id, post_id, channel_id): 346 | params = [ 347 | b"\b\x04\x10\x07\x1A", 348 | len(comment_id).to_bytes(1, "big"), 349 | comment_id.encode(), 350 | b"0\x008", 351 | (not value).to_bytes(1, "big"), 352 | b"J\x15115587043600121621724P\x00\xA8\x01\x01\xB2\x01", 353 | len(post_id).to_bytes(1, "big"), 354 | post_id.encode(), 355 | b"\xBA\x01", 356 | len(channel_id).to_bytes(1, "big"), 357 | channel_id.encode(), 358 | b"\xF0\x01\x01" 359 | ] 360 | 361 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 362 | 363 | return params 364 | 365 | def set_dislike_comment(self, value=True): 366 | return Comment._set_dislike_comment(value, comment_id=self.comment_id, post_id=self.post_id, channel_id=self.channel_id) 367 | 368 | @staticmethod 369 | def _set_dislike_comment(value, dislike_comment_params=None, comment_id=None, post_id=None, channel_id=None): 370 | if dislike_comment_params is None: 371 | dislike_comment_params = Comment.get_dislike_comment_params(value, comment_id, post_id, channel_id) 372 | 373 | return Comment.perform_action(dislike_comment_params) 374 | 375 | @staticmethod 376 | def get_like_comment_params(value, comment_id, post_id, channel_id): 377 | params = [ 378 | b"\b\x05\x10\x07\x1A", 379 | len(comment_id).to_bytes(1, "big"), 380 | comment_id.encode(), 381 | b"0\x008", 382 | (not value).to_bytes(1, "big"), 383 | b"J\x15115587043600121621724P\x00\xA8\x01\x01\xB2\x01", 384 | len(post_id).to_bytes(1, "big"), 385 | post_id.encode(), 386 | b"\xBA\x01", 387 | len(channel_id).to_bytes(1, "big"), 388 | channel_id.encode(), 389 | b"\xF0\x01\x01" 390 | ] 391 | 392 | params = 
urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 393 | 394 | return params 395 | 396 | def set_like_comment(self, value=True): 397 | return Comment._set_like_comment(value, comment_id=self.comment_id, post_id=self.post_id, channel_id=self.channel_id) 398 | 399 | @staticmethod 400 | def _set_like_comment(value, like_comment_params=None, comment_id=None, post_id=None, channel_id=None): 401 | if like_comment_params is None: 402 | like_comment_params = Comment.get_like_comment_params(value, comment_id, post_id, channel_id) 403 | 404 | return Comment.perform_action(like_comment_params) 405 | 406 | @staticmethod 407 | def perform_action(action_params): 408 | headers = { 409 | "Accept-Language": "en-US,en;q=0.9", 410 | "x-origin": "https://www.youtube.com" 411 | } 412 | 413 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 414 | if "SAPISID" in current_cookies: 415 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 416 | 417 | json_body = { 418 | "context": { 419 | "client": { 420 | "clientName": "WEB", 421 | "clientVersion": CLIENT_VERSION, 422 | }, 423 | }, 424 | "actions": [ 425 | action_params 426 | ] 427 | } 428 | 429 | r = requests_cache.post( 430 | Comment.FORMAT_URLS["PERFORM_COMMENT_ACTION_ENDPOINT"], 431 | json=json_body, 432 | headers=headers 433 | ) 434 | 435 | return r.json() 436 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/community_tab.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from requests.utils import dict_from_cookiejar 4 | 5 | from .helpers.utils import safely_get_value_from_key, get_auth_header, CLIENT_VERSION, search_key 6 | from .requests_handler import requests_cache 7 | from .post import Post 8 | 9 | 10 | class CommunityTab(object): 11 | FORMAT_URLS = { 12 | "COMMUNITY_TAB": "https://www.youtube.com/{}/{}/community", 13 | # HARD_CODED: This key seems to be constant to everyone, IDK 14 | "BROWSE_ENDPOINT": "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" 15 | } 16 | 17 | REGEX = { 18 | "YT_INITIAL_DATA": "ytInitialData = ({(?:(?:.|\n)*)?});", 19 | "COMMUNITY_TAB_URL": "^\/.*\/community$" 20 | } 21 | 22 | def __init__(self, channel_name): 23 | self.channel_name = channel_name 24 | 25 | self.posts_continuation_token = None 26 | self.click_tracking_params = None 27 | self.visitor_data = None 28 | self.session_index = "0" 29 | self.posts = [] 30 | self.community_url = None 31 | self.channel_id = None 32 | 33 | def load_posts(self, expire_after=0): 34 | headers = { 35 | "Accept-Language": "en-US,en;q=0.9", 36 | "Referer": self.community_url 37 | } 38 | 39 | # Add authorization header 40 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 41 | if "SAPISID" in current_cookies: 42 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 43 | 44 | if self.posts_continuation_token is None: 45 | try: 46 | # Get posts from community tab enpoint 47 | self.community_url = CommunityTab.FORMAT_URLS["COMMUNITY_TAB"].format("c", self.channel_name) 48 | r = requests_cache.get(self.community_url, expire_after=expire_after, headers=headers) 49 | if r.status_code != 200: 50 | self.community_url = CommunityTab.FORMAT_URLS["COMMUNITY_TAB"].format("channel", self.channel_name) 51 | r = requests_cache.get(self.community_url, expire_after=expire_after, headers=headers) 52 | 53 | if r.status_code != 200: 54 | import sys 55 
| 56 | print(f"[Can't get data from the channel_name: {self.channel_name}]") 57 | sys.exit() 58 | 59 | m = re.findall(CommunityTab.REGEX["YT_INITIAL_DATA"], r.text) 60 | data = json.loads(m[0]) 61 | 62 | if self.channel_id is None: 63 | self.channel_id = data["metadata"]["channelMetadataRenderer"]["externalId"] 64 | 65 | except IndexError as e: 66 | print("[Can't find yt_initial_data using the regex]") 67 | raise e 68 | except json.decoder.JSONDecodeError as e: 69 | print("[Can't parse yt_initial_data from the regex]") 70 | raise e 71 | except Exception as e: 72 | print("[Some non-expected exception, probably caused by requests...]") 73 | raise e 74 | 75 | tabs = data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"] 76 | community_tab = CommunityTab.get_community_tab(tabs) 77 | community_tab_items = CommunityTab.get_items_from_community_tab(community_tab) 78 | 79 | self.click_tracking_params = CommunityTab.get_click_tracking_params_from_community_tab(community_tab) 80 | self.visitor_data = data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"]["visitorData"] 81 | self.session_index = str( 82 | safely_get_value_from_key(data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"], "sessionIndex", default="") 83 | ) 84 | self.append_posts_from_items(community_tab_items) 85 | elif self.posts_continuation_token is not False: 86 | headers.update( 87 | { 88 | "X-Goog-AuthUser": self.session_index, 89 | "X-Origin": "https://www.youtube.com", 90 | "X-Youtube-Client-Name": "1", 91 | "X-Youtube-Client-Version": CLIENT_VERSION 92 | } 93 | ) 94 | 95 | json_body = { 96 | "context": { 97 | "client": {"clientName": "WEB", "clientVersion": CLIENT_VERSION, "originalUrl": self.community_url, "visitorData": self.visitor_data} 98 | }, 99 | "continuation": self.posts_continuation_token, 100 | "clickTracking": {"clickTrackingParams": self.click_tracking_params} 101 | } 102 | 103 | r = requests_cache.post(CommunityTab.FORMAT_URLS["BROWSE_ENDPOINT"], json=json_body, expire_after=expire_after, headers=headers) 104 | 105 | data = r.json() 106 | append = data["onResponseReceivedEndpoints"][0]["appendContinuationItemsAction"] 107 | self.click_tracking_params = data["onResponseReceivedEndpoints"][0]["clickTrackingParams"] 108 | self.append_posts_from_items(safely_get_value_from_key(append, "continuationItems", default=[])) 109 | 110 | def append_posts_from_items(self, items): 111 | there_is_no_continuation_token = True 112 | for item in items: 113 | kind = list(item.keys())[0] 114 | 115 | if kind == "backstagePostThreadRenderer": 116 | post_data = item["backstagePostThreadRenderer"]["post"] 117 | self.posts.append(Post.from_data(post_data)) 118 | elif kind == "continuationItemRenderer": 119 | self.posts_continuation_token = item[kind]["continuationEndpoint"]["continuationCommand"]["token"] 120 | there_is_no_continuation_token = False 121 | 122 | if there_is_no_continuation_token: 123 | self.posts_continuation_token = False 124 | 125 | @staticmethod 126 | def get_community_tab(tabs): 127 | for tab in tabs: 128 | if "tabRenderer" in tab and re.match(CommunityTab.REGEX["COMMUNITY_TAB_URL"], tab["tabRenderer"]["endpoint"]["commandMetadata"]["webCommandMetadata"]["url"]): 129 | return tab 130 | raise Exception(f"[Could not find a Community tab in the channel response]") 131 | 132 | @staticmethod 133 | def get_items_from_community_tab(tab): 134 | try: 135 | return tab["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"] 136 | except 
Exception as e: 137 | print("[Can't get the contents from the tab]") 138 | raise e 139 | 140 | @staticmethod 141 | def get_click_tracking_params_from_community_tab(tab): 142 | try: 143 | return tab["tabRenderer"]["content"]["sectionListRenderer"]["trackingParams"] 144 | except Exception as e: 145 | print("[Can't get tracking params from the tab]") 146 | raise e 147 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/helpers/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | safely_get_value_from_key, 3 | save_object_to_file, 4 | safely_pop_value_from_key, 5 | search_key, 6 | get_auth_header, 7 | CLIENT_VERSION 8 | ) 9 | from .clean_items import ( 10 | clean_content_text, 11 | clean_backstage_attachment 12 | ) 13 | 14 | __all__ = [ 15 | "safely_get_value_from_key", 16 | "safely_pop_value_from_key", 17 | "save_object_to_file", 18 | "search_key", 19 | "get_auth_header", 20 | "clean_content_text", 21 | "clean_backstage_attachment", 22 | "CLIENT_VERSION" 23 | ] 24 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/helpers/clean_items.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import parse_qs, unquote, urlparse 2 | from .utils import safely_get_value_from_key as safe 3 | from .utils import safely_pop_value_from_key as safe_pop 4 | 5 | 6 | # Lots of returned objects are full of tracking params, client data, duplicate info, etc.; this trims the fat. 7 | def clean_content_text(content): 8 | for item in safe(content, "runs", default=[]): 9 | if "navigationEndpoint" in item: 10 | # traditional links 11 | if "urlEndpoint" in item["navigationEndpoint"]: 12 | url = item["navigationEndpoint"]["urlEndpoint"]["url"] 13 | # replace redirects with direct links 14 | if url.startswith("https://www.youtube.com/redirect"): 15 | parsed_url = urlparse(url) 16 | redirect_url = parse_qs(parsed_url.query)["q"][0] 17 | url = unquote(redirect_url) 18 | item["urlEndpoint"] = {"url": url} 19 | item.pop("navigationEndpoint") 20 | # hashtags 21 | elif "browseEndpoint" in item["navigationEndpoint"]: 22 | item.pop("loggingDirectives", None) 23 | safe_pop(item, "navigationEndpoint", "browseEndpoint", "params") 24 | item["browseEndpoint"] = item["navigationEndpoint"]["browseEndpoint"] 25 | item["browseEndpoint"]["url"] = item["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"] 26 | item.pop("navigationEndpoint") 27 | return content 28 | 29 | def clean_backstage_attachment(attachment): 30 | if attachment: 31 | if "pollRenderer" in attachment: 32 | for choice in attachment["pollRenderer"]["choices"]: 33 | for value in [ 34 | "selectServiceEndpoint", 35 | "deselectServiceEndpoint", 36 | "voteRatioIfSelected", 37 | "votePercentageIfSelected", 38 | "voteRatioIfNotSelected", 39 | "votePercentageIfNotSelected" 40 | ]: 41 | safe_pop(choice, value) 42 | elif "videoRenderer" in attachment: 43 | safe_pop(attachment, "videoRenderer", "navigationEndpoint", "watchEndpoint", "watchEndpointSupportedOnesieConfig") 44 | attachment["videoRenderer"]["watchEndpoint"] = safe(attachment, "videoRenderer", "navigationEndpoint", "watchEndpoint", default={}) 45 | attachment["videoRenderer"]["watchEndpoint"]["url"] = safe( 46 | attachment, "videoRenderer", "navigationEndpoint", "commandMetadata", "webCommandMetadata", "url" 47 | ) 48 |
49 | for long_by_line in safe(attachment, "videoRenderer", "longBylineText", "runs", default=[]): 50 | long_by_line["browseEndpoint"] = long_by_line["navigationEndpoint"]["browseEndpoint"] 51 | long_by_line.pop("navigationEndpoint") 52 | 53 | for short_by_line in safe(attachment, "videoRenderer", "shortBylineText", "runs", default=[]): 54 | short_by_line["browseEndpoint"] = short_by_line["navigationEndpoint"]["browseEndpoint"] 55 | short_by_line.pop("navigationEndpoint") 56 | 57 | for author in safe(attachment, "videoRenderer", "ownerText", "runs", default=[]): 58 | author["browseEndpoint"] = author["navigationEndpoint"]["browseEndpoint"] 59 | 60 | for value in [ 61 | "publishedTimeText", 62 | "navigationEndpoint", 63 | "trackingParams", 64 | "showActionMenu", 65 | "menu", 66 | "channelThumbnailSupportedRenderers", 67 | "thumbnailOverlays" 68 | ]: 69 | safe_pop(attachment, "videoRenderer", value) 70 | elif "backstageImageRenderer" in attachment: 71 | safe_pop(attachment, "backstageImageRenderer", "trackingParams") 72 | elif "postMultiImageRenderer" in attachment: 73 | for image in attachment["postMultiImageRenderer"]["images"]: 74 | safe_pop(image, "backstageImageRenderer", "trackingParams") 75 | return attachment 76 | return None 77 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/helpers/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from hashlib import sha1 4 | 5 | CLIENT_VERSION = "2.20220311.01.00" 6 | 7 | def safely_get_value_from_key(*args, default=None): 8 | obj = args[0] 9 | keys = args[1:] 10 | 11 | for key in keys: 12 | try: 13 | obj = obj[key] 14 | except Exception: 15 | return default 16 | 17 | return obj 18 | 19 | 20 | def safely_pop_value_from_key(*args): 21 | obj = args[0] 22 | keys = args[1:-1] 23 | 24 | for key in keys: 25 | try: 26 | obj = obj[key] 27 | except Exception: 28 | return None 29 | 30 | pop_key = args[-1] 31 | 32 | if pop_key in obj: 33 | obj.pop(pop_key) 34 | 35 | 36 | def search_key(key, data, current_key=[]): 37 | found = [] 38 | 39 | if type(data).__name__ == "dict": 40 | keys = list(data.keys()) 41 | elif type(data).__name__ == "list": 42 | keys = list(range(len(data))) 43 | else: 44 | return [] 45 | 46 | if key in keys: 47 | found.append((current_key + [key], data[key])) 48 | keys.remove(key) 49 | 50 | for k in keys: 51 | found += search_key(key, data[k], current_key=current_key + [k]) 52 | 53 | return found 54 | 55 | 56 | def save_object_to_file(obj, path): 57 | with open(path, "w") as f: 58 | f.write(json.dumps(obj, indent=4)) 59 | 60 | 61 | # Builds YouTube's SAPISIDHASH Authorization header (an observed, not officially documented scheme): 62 | # sha1("<unix timestamp> <SAPISID cookie> <origin>"), prefixed with the same timestamp 63 | def get_auth_header(sapisid): 64 | timestring = str(int(time.time())) 65 | return f"SAPISIDHASH {timestring}_" + sha1(" ".join([timestring, sapisid, "https://www.youtube.com"]).encode()).hexdigest() 66 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/post.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from requests.utils import dict_from_cookiejar 4 | from base64 import urlsafe_b64encode 5 | 6 | from .helpers.clean_items import clean_content_text, clean_backstage_attachment 7 | from .helpers.utils import safely_get_value_from_key, search_key, get_auth_header, CLIENT_VERSION 8 | from .requests_handler import requests_cache 9 | from .comment import Comment 10 | 11 | 12 | class Post(object): 13 | FORMAT_URLS = { 14
| "POST": "https://www.youtube.com/post/{}", 15 | # HARD_CODED: This key seems to be constant to everyone, IDK 16 | "BROWSE_ENDPOINT": "https://www.youtube.com/youtubei/v1/browse?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", 17 | "CREATE_COMMENT_ENDPOINT": "https://www.youtube.com/youtubei/v1/comment/create_comment?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8&prettyPrint=false" 18 | } 19 | 20 | REGEX = { 21 | "YT_INITIAL_DATA": "ytInitialData = ({(?:(?:.|\n)*)?});" 22 | } 23 | 24 | def __init__(self, post_id, channel_id, author=None, content_text=None, backstage_attachment=None, vote_count=None, sponsor_only_badge=None, published_time_text=None, original_post=None): 25 | self.post_id = post_id 26 | self.channel_id = channel_id 27 | self.author = author 28 | self.content_text = content_text 29 | self.backstage_attachment = backstage_attachment 30 | self.vote_count = vote_count 31 | self.sponsor_only_badge = sponsor_only_badge 32 | self.published_time_text = published_time_text 33 | self.original_post = original_post 34 | 35 | self.first = True 36 | self.comments = [] 37 | self.comments_continuation_token = None 38 | self.click_tracking_params = None 39 | self.visitor_data = None 40 | self.session_index = "0" 41 | 42 | def as_json(self): 43 | return { 44 | "post_id": self.post_id, 45 | "channel_id": self.channel_id, 46 | "author": self.author, 47 | "content_text": self.content_text, 48 | "backstage_attachment": self.backstage_attachment, 49 | "vote_count": self.vote_count, 50 | "sponsor_only_badge": self.sponsor_only_badge, 51 | "original_post": self.original_post and self.original_post.as_json() 52 | } 53 | 54 | def get_published_string(self): 55 | return self.published_time_text 56 | 57 | @staticmethod 58 | def from_post_id(post_id, expire_after=0): 59 | headers = { 60 | "Accept-Language": "en-US,en;q=0.9", 61 | "Referer": Post.FORMAT_URLS["POST"].format(post_id) 62 | } 63 | 64 | # Add authorization header 65 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 66 | if "SAPISID" in current_cookies: 67 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 68 | 69 | post_url = Post.FORMAT_URLS["POST"].format(post_id) 70 | r = requests_cache.get(post_url, expire_after=expire_after, headers=headers) 71 | 72 | m = re.findall(Post.REGEX["YT_INITIAL_DATA"], r.text) 73 | data = json.loads(m[0]) 74 | community_tab = data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][0] 75 | community_tab_items = Post.get_items_from_community_tab(community_tab) 76 | 77 | post_data = community_tab_items[0]["backstagePostThreadRenderer"]["post"] 78 | 79 | post = Post.from_data(post_data) 80 | post.get_first_continuation_token(data) 81 | post.get_click_tracking_params(data) 82 | post.visitor_data = data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"]["visitorData"] 83 | post.session_index = str( 84 | safely_get_value_from_key(data, "responseContext", "webResponseContextExtensionData", "ytConfigData", "sessionIndex", default="") 85 | ) 86 | 87 | return post 88 | 89 | def __str__(self): 90 | return json.dumps(self.as_json(), indent=4) 91 | 92 | def __repr__(self): 93 | return self.__str__() 94 | 95 | def get_thumbnails(self): 96 | # Returns a list of the thumbnails in different resolutions of 97 | # all images present in the post 98 | thumbnails = [] 99 | 100 | if self.backstage_attachment is not None: 101 | renderer_key = list(self.backstage_attachment.keys())[0] 102 | 103 | if renderer_key == "videoRenderer": 104 | thumbnails = 
[self.backstage_attachment[renderer_key]["thumbnail"]["thumbnails"]] 105 | elif renderer_key == "backstageImageRenderer": 106 | thumbnails = [self.backstage_attachment[renderer_key]["image"]["thumbnails"]] 107 | elif renderer_key == "postMultiImageRenderer": 108 | thumbnails = [img["backstageImageRenderer"]["image"]["thumbnails"] for img in self.backstage_attachment[renderer_key]["images"]] 109 | elif renderer_key == "pollRenderer": 110 | print("[There is nothing implemented for polls]") 111 | thumbnails = [] 112 | else: 113 | raise Exception(f"There is no implementation for renderer_key={renderer_key} yet") 114 | 115 | return thumbnails 116 | 117 | # Digs the initial comments continuation token out of the post page's ytInitialData 118 | def get_first_continuation_token(self, data): 119 | self.comments_continuation_token = data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][0]["tabRenderer"]["content"]["sectionListRenderer"][ 120 | "contents"][1]["itemSectionRenderer"]["contents"][0]["continuationItemRenderer"]["continuationEndpoint"]["continuationCommand"]["token"] 121 | 122 | # Extracts the click tracking params that accompany that continuation token 123 | def get_click_tracking_params(self, data): 124 | self.click_tracking_params = data["contents"]["twoColumnBrowseResultsRenderer"]["tabs"][0]["tabRenderer"]["content"]["sectionListRenderer"][ 125 | "contents"][1]["itemSectionRenderer"]["contents"][0]["continuationItemRenderer"]["continuationEndpoint"]["clickTrackingParams"] 126 | 127 | # Loads one page of comments. The first call scrapes the post page for the 128 | # initial continuation token; subsequent calls hit the browse endpoint with 129 | # the stored continuation token and click tracking params. 130 | def load_comments(self, expire_after=0): 131 | headers = { 132 | "Accept-Language": "en-US,en;q=0.9", 133 | "Referer": Post.FORMAT_URLS["POST"].format(self.post_id) 134 | } 135 | 136 | # Add authorization header 137 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 138 | if "SAPISID" in current_cookies: 139 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 140 | 141 | if self.comments_continuation_token is None: 142 | try: 143 | r = requests_cache.get(Post.FORMAT_URLS["POST"].format(self.post_id), expire_after=expire_after, headers=headers) 144 | 145 | m = re.findall(Post.REGEX["YT_INITIAL_DATA"], r.text) 146 | data = json.loads(m[0]) 147 | 148 | self.get_first_continuation_token(data) 149 | self.get_click_tracking_params(data) 150 | self.visitor_data = data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"]["visitorData"] 151 | self.session_index = str(data["responseContext"]["webResponseContextExtensionData"]["ytConfigData"]["sessionIndex"]) 152 | self.load_comments(expire_after=expire_after) 153 | except Exception as e: 154 | print("[Some non-expected exception, probably caused by requests...]") 155 | raise e 156 | elif self.comments_continuation_token is not False: 157 | headers.update( 158 | { 159 | "X-Goog-AuthUser": self.session_index, 160 | "X-Origin": "https://www.youtube.com", 161 | "X-Youtube-Client-Name": "1", 162 | "X-Youtube-Client-Version": CLIENT_VERSION 163 | } 164 | ) 165 | 166 | json_body = { 167 | "context": { 168 | "client": { 169 | "clientName": "WEB", 170 | "clientVersion": CLIENT_VERSION, 171 | "originalUrl": Post.FORMAT_URLS["POST"].format(self.post_id), 172 | "visitorData": self.visitor_data 173 | } 174 | }, 175 | "continuation": self.comments_continuation_token, 176 | "clickTracking": {"clickTrackingParams": self.click_tracking_params} 177 | } 178 |
179 | r = requests_cache.post(Post.FORMAT_URLS["BROWSE_ENDPOINT"], json=json_body, expire_after=expire_after, headers=headers) 180 | 181 | data = r.json() 182 | if self.first: 183 | if "continuationItems" not in data["onResponseReceivedEndpoints"][1]["reloadContinuationItemsCommand"]: 184 | # There are no comments 185 | continuation_items = [] 186 | else: 187 | append = data["onResponseReceivedEndpoints"][1]["reloadContinuationItemsCommand"] 188 | continuation_items = safely_get_value_from_key(append, "continuationItems", default=[]) 189 | self.first = False 190 | else: 191 | append = data["onResponseReceivedEndpoints"][0]["appendContinuationItemsAction"] 192 | continuation_items = safely_get_value_from_key(append, "continuationItems", default=[]) 193 | 194 | self.click_tracking_params = data["trackingParams"] 195 | self.append_comments_from_items(continuation_items) 196 | 197 | def append_comments_from_items(self, items): 198 | there_is_no_continuation_token = True 199 | for item in items: 200 | kind = list(item.keys())[0] 201 | 202 | if kind == "commentThreadRenderer": 203 | self.comments.append( 204 | Comment.from_data( 205 | item[kind]["comment"]["commentRenderer"], 206 | self.post_id, 207 | self.channel_id, 208 | safely_get_value_from_key( 209 | item[kind], 210 | "replies", 211 | "commentRepliesRenderer", 212 | "contents", 213 | 0, 214 | "continuationItemRenderer", 215 | "continuationEndpoint", 216 | "continuationCommand", 217 | "token" 218 | ), 219 | safely_get_value_from_key( 220 | item[kind], 221 | "replies", 222 | "commentRepliesRenderer", 223 | "contents", 224 | 0, 225 | "continuationItemRenderer", 226 | "continuationEndpoint", 227 | "clickTrackingParams" 228 | ), 229 | self.visitor_data, 230 | self.session_index 231 | ) 232 | ) 233 | elif kind == "continuationItemRenderer": 234 | self.comments_continuation_token = item[kind]["continuationEndpoint"]["continuationCommand"]["token"] 235 | there_is_no_continuation_token = False 236 | 237 | if there_is_no_continuation_token: 238 | self.comments_continuation_token = False 239 | 240 | def get_text(self): 241 | runs = safely_get_value_from_key(self.content_text, "runs", default=[]) 242 | 243 | if self.content_text is not None: 244 | return "\n".join([run["text"] for run in runs]) 245 | return None 246 | 247 | def get_create_comment_params(self): 248 | if self.channel_id is None or self.post_id is None: 249 | return None 250 | 251 | params = [ 252 | b"*\x02\b\x00P\x01\xA2\x01", 253 | len(self.post_id).to_bytes(1, "big"), 254 | self.post_id.encode(), 255 | b"\xAA\x01", 256 | len(self.channel_id).to_bytes(1, "big"), 257 | self.channel_id.encode(), 258 | ] 259 | 260 | params = urlsafe_b64encode(b"".join(params)).decode().replace("=", "%3D") 261 | 262 | return params 263 | 264 | def create_comment(self, comment_text): 265 | headers = { 266 | "Accept-Language": "en-US,en;q=0.9", 267 | "x-origin": "https://www.youtube.com" 268 | } 269 | 270 | current_cookies = dict_from_cookiejar(requests_cache.cookies) 271 | if "SAPISID" in current_cookies: 272 | headers["Authorization"] = get_auth_header(current_cookies["SAPISID"]) 273 | 274 | json_body = { 275 | "context": { 276 | "client": { 277 | "clientName": "WEB", 278 | "clientVersion": CLIENT_VERSION, 279 | }, 280 | }, 281 | "createCommentParams": self.get_create_comment_params(), 282 | "commentText": comment_text 283 | } 284 | 285 | r = requests_cache.post( 286 | Post.FORMAT_URLS["CREATE_COMMENT_ENDPOINT"], 287 | json=json_body, 288 | headers=headers 289 | ) 290 | 291 | try: 292 | data = r.json() 293 
| comment_id = search_key("comment", data)[0][1]["commentRenderer"]["commentId"] 294 | 295 | return Comment.from_ids(comment_id, self.post_id, self.channel_id) 296 | except Exception as e: 297 | raise e 298 | 299 | @staticmethod 300 | def from_data(post_data): 301 | if "sharedPostRenderer" in post_data: 302 | data = post_data["sharedPostRenderer"] 303 | data["contentText"] = data.pop("content") 304 | data["authorText"] = data.pop("displayName") 305 | data["authorEndpoint"] = data.pop("endpoint") 306 | 307 | original_post_data = post_data["sharedPostRenderer"]["originalPost"] 308 | data["originalPost"] = Post.from_data(original_post_data) 309 | 310 | elif "backstagePostRenderer" in post_data: 311 | data = post_data["backstagePostRenderer"] 312 | else: 313 | raise NotImplementedError(f"[post_kind={list(post_data.keys())[0]} is not implemented yet!]") 314 | 315 | data["channelId"] = data["authorEndpoint"]["browseEndpoint"]["browseId"] 316 | 317 | # clean the author cause it's different here for some reason 318 | for item in data["authorText"]["runs"]: 319 | item["browseEndpoint"] = item["navigationEndpoint"]["browseEndpoint"] 320 | item["browseEndpoint"]["url"] = item["navigationEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"] 321 | item.pop("navigationEndpoint") 322 | data["authorEndpoint"]["browseId"] = data["authorEndpoint"]["browseEndpoint"]["browseId"] 323 | author_url = data["authorEndpoint"]["commandMetadata"]["webCommandMetadata"]["url"] 324 | data["authorEndpoint"]["url"] = author_url 325 | for value in ["clickTrackingParams", "commandMetadata", "browseEndpoint"]: 326 | data["authorEndpoint"].pop(value) 327 | 328 | post = Post( 329 | data["postId"], 330 | channel_id=data["channelId"], 331 | author={ 332 | "authorText": safely_get_value_from_key(data, "authorText"), 333 | "authorThumbnail": safely_get_value_from_key(data, "authorThumbnail"), 334 | "authorEndpoint": safely_get_value_from_key(data, "authorEndpoint") 335 | }, 336 | content_text=clean_content_text(safely_get_value_from_key(data, "contentText")), 337 | backstage_attachment=clean_backstage_attachment(safely_get_value_from_key(data, "backstageAttachment", default=None)), 338 | vote_count=safely_get_value_from_key(data, "voteCount"), 339 | sponsor_only_badge=safely_get_value_from_key(data, "sponsorsOnlyBadge", default=None), 340 | published_time_text=safely_get_value_from_key(data, "publishedTimeText", "runs", 0, "text", default=None), 341 | original_post=safely_get_value_from_key(data, "originalPost", default=None) 342 | ) 343 | 344 | post.raw_data = data 345 | 346 | return post 347 | 348 | @staticmethod 349 | def get_items_from_community_tab(tab): 350 | try: 351 | return tab["tabRenderer"]["content"]["sectionListRenderer"]["contents"][0]["itemSectionRenderer"]["contents"] 352 | except Exception as e: 353 | print("[Can't get the contents from the tab]") 354 | raise e 355 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/reply.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from .helpers.utils import safely_get_value_from_key 4 | 5 | 6 | class Reply(object): 7 | def __init__(self, reply_id, author=None, content_text=None, vote_count=None): 8 | self.reply_id = reply_id 9 | self.author = author 10 | self.content_text = content_text 11 | self.vote_count = vote_count 12 | 13 | def as_json(self): 14 | return {"reply_id": self.reply_id, "author": self.author, "content_text": 
self.content_text, "vote_count": self.vote_count} 15 | 16 | def __str__(self): 17 | return json.dumps(self.as_json(), indent=4) 18 | 19 | def __repr__(self): 20 | return self.__str__() 21 | 22 | def get_text(self): 23 | if self.content_text is not None: 24 | return "".join([run["text"] for run in self.content_text["runs"]]) 25 | return None 26 | 27 | @staticmethod 28 | def from_data(data): 29 | reply = Reply( 30 | data["commentId"], 31 | content_text=safely_get_value_from_key(data, "contentText"), 32 | author={ 33 | "authorText": safely_get_value_from_key(data, "authorText"), 34 | "authorThumbnail": safely_get_value_from_key(data, "authorThumbnail"), 35 | "authorEndpoint": safely_get_value_from_key(data, "authorEndpoint", "browseEndpoint"), 36 | "authorIsChannelOwner": safely_get_value_from_key(data, "authorIsChannelOwner"), 37 | "sponsorCommentBadge": safely_get_value_from_key(data, "sponsorCommentBadge") 38 | }, 39 | vote_count=safely_get_value_from_key(data, "voteCount") 40 | ) 41 | 42 | reply.raw_data = data 43 | 44 | return reply 45 | -------------------------------------------------------------------------------- /youtube-community-tab/src/youtube_community_tab/requests_handler.py: -------------------------------------------------------------------------------- 1 | import os 2 | from requests_cache import CachedSession 3 | 4 | dirname = os.path.dirname(__file__) 5 | CACHE_FILE_PATH = os.path.join(dirname, "requests_cache.sqlite") 6 | 7 | requests_cache = CachedSession(allowable_methods=("GET", "POST"), cache_name=CACHE_FILE_PATH) 8 | -------------------------------------------------------------------------------- /youtube-community-tab/tests/test_actions.py: -------------------------------------------------------------------------------- 1 | from http import cookiejar 2 | from youtube_community_tab.requests_handler import requests_cache 3 | from youtube_community_tab.helpers import search_key 4 | from youtube_community_tab import Post 5 | import time 6 | 7 | EXPIRATION_TIME = 24 * 60 * 60 # requests cache expiration 8 | 9 | cookie_jar = cookiejar.MozillaCookieJar("./cookies.txt") 10 | cookie_jar.load() 11 | requests_cache.cookies = cookie_jar 12 | 13 | 14 | def test_actions(): 15 | post = Post.from_post_id("UgkxpAbrgRG3trNwPVu9ipY7vALkJ_Q-c1lv") 16 | comment = post.create_comment(f"[Current timestamp: {time.time()}]") 17 | 18 | assert comment is not None 19 | 20 | r = comment.set_like_comment() 21 | s = search_key("status", r) 22 | 23 | assert len(s) > 0 and s[0][1] == "STATUS_SUCCEEDED" 24 | 25 | r = comment.update_comment(f"[Edited][Current timestamp: {time.time()}]") 26 | s = search_key("status", r) 27 | 28 | assert len(s) > 0 and s[0][1] == "STATUS_SUCCEEDED" 29 | 30 | r = comment.set_dislike_comment() 31 | s = search_key("status", r) 32 | 33 | assert len(s) > 0 and s[0][1] == "STATUS_SUCCEEDED" 34 | 35 | r = comment.delete_comment() 36 | s = search_key("status", r) 37 | 38 | assert len(s) > 0 and s[0][1] == "STATUS_SUCCEEDED" 39 | 40 | 41 | if __name__ == "__main__": 42 | test_actions() 43 | -------------------------------------------------------------------------------- /youtube-community-tab/tests/test_community_tab.py: -------------------------------------------------------------------------------- 1 | from youtube_community_tab.community_tab import CommunityTab 2 | 3 | EXPIRATION_TIME = 24 * 60 * 60 # requests cache expiration 4 | 5 | 6 | def test_community_tab(): 7 | ct = CommunityTab("vsauce1") 8 | ct.load_posts(expire_after=EXPIRATION_TIME) 9 | 10 | num_posts = 

    num_posts = len(ct.posts)

    assert num_posts > 0
    assert ct.posts_continuation_token

    ct.load_posts(expire_after=EXPIRATION_TIME)
    num_posts_ = len(ct.posts)

    assert num_posts_ > num_posts

    post = ct.posts[-1]  # choose an old post to raise the probability of 'good' data
    post.load_comments(expire_after=EXPIRATION_TIME)

    num_comments = len(post.comments)

    assert num_comments > 0
    assert post.comments_continuation_token

    post.load_comments(expire_after=EXPIRATION_TIME)

    num_comments_ = len(post.comments)

    assert num_comments_ > num_comments

    replied_comments = list(filter(lambda x: x.replies_continuation_token, post.comments))

    if len(replied_comments) > 0:
        comment = replied_comments[0]

        comment.load_replies(expire_after=EXPIRATION_TIME)

        assert len(comment.replies) > 0


if __name__ == "__main__":
    test_community_tab()
--------------------------------------------------------------------------------
/youtube-community-tab/tests/test_membership.py:
--------------------------------------------------------------------------------
from http import cookiejar
from youtube_community_tab.requests_handler import requests_cache
from youtube_community_tab.community_tab import CommunityTab
from youtube_community_tab import Post

EXPIRATION_TIME = 24 * 60 * 60  # requests cache expiration

cookie_jar = cookiejar.MozillaCookieJar("./cookies.txt")
cookie_jar.load()
requests_cache.cookies = cookie_jar


def test_load_membership_posts():
    ct = CommunityTab("UCMwGHR0BTZuLsmjY_NT5Pwg")
    ct.load_posts(expire_after=EXPIRATION_TIME)

    membership_post = None
    while ct.posts_continuation_token:
        for post in ct.posts:
            if post.sponsor_only_badge is not None:
                membership_post = post
                break

        if membership_post is not None:
            break

        ct.load_posts(expire_after=EXPIRATION_TIME)

    assert membership_post is not None
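

# fetching a members-only post directly by ID (below) presumably succeeds only
# because the cookies loaded above carry an active membership for this channel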

def test_membership_post():
    post = Post.from_post_id("UgkxJYrBY-QqIt1ysrZY0ZP84SGJLWmDmtoU", expire_after=EXPIRATION_TIME)

    # This post can be edited, so this test can fail in the future
    post_text = post.get_text()

    expected_text = "Cheeeeeeeeeeeeeeeese\nAm I bored? I don't know.... nyeh 😺"

    assert post_text == expected_text

    post.load_comments(expire_after=EXPIRATION_TIME)
    num_comments = len(post.comments)

    assert num_comments > 0
    assert post.comments_continuation_token

    post.load_comments(expire_after=EXPIRATION_TIME)
    num_comments_ = len(post.comments)

    assert num_comments_ > num_comments

    replied_comments = list(filter(lambda x: x.replies_continuation_token, post.comments))

    if len(replied_comments) > 0:
        comment = replied_comments[0]

        comment.load_replies(expire_after=EXPIRATION_TIME)

        assert len(comment.replies) > 0


if __name__ == "__main__":
    test_load_membership_posts()
    test_membership_post()
--------------------------------------------------------------------------------
/youtube-community-tab/tests/test_post.py:
--------------------------------------------------------------------------------
from youtube_community_tab.post import Post

EXPIRATION_TIME = 24 * 60 * 60  # requests cache expiration


def test_post():
    post = Post.from_post_id("UgznJEQUR0fJzoMlS2Z4AaABCQ", expire_after=EXPIRATION_TIME)

    # This post can be edited, so this test can fail in the future
    post_text = post.get_text()
    expected_text = "Vsauce is 11 years old today!!!!"

    assert post_text == expected_text

    post.load_comments(expire_after=EXPIRATION_TIME)
    num_comments = len(post.comments)

    assert num_comments > 0
    assert post.comments_continuation_token

    post.load_comments(expire_after=EXPIRATION_TIME)
    num_comments_ = len(post.comments)

    assert num_comments_ > num_comments

    replied_comments = list(filter(lambda x: x.replies_continuation_token, post.comments))

    if len(replied_comments) > 0:
        comment = replied_comments[0]

        comment.load_replies(expire_after=EXPIRATION_TIME)

        assert len(comment.replies) > 0


if __name__ == "__main__":
    test_post()
--------------------------------------------------------------------------------
/ytct.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
import argparse
from datetime import datetime
from http import cookiejar
import json
import os
import re
import requests
import sys
import urllib.parse as urlparse
from youtube_community_tab.requests_handler import requests_cache
from youtube_community_tab.post import Post
from youtube_community_tab.community_tab import CommunityTab

POST_REGEX = r"^(?:(?:https?:\/\/)?(?:.*?\.)?(?:youtube\.com\/)((?:channel\/UC[a-zA-Z0-9_-]+\/community\?lb=)|post\/))?(?P<post_id>Ug[a-zA-Z0-9_-]+)(.*)?$"
CHANNEL_REGEX = r"^(?:(?:https?:\/\/)?(?:.*?\.)?(?:youtube\.com\/))((?P<channel_handle>@[a-zA-Z0-9_-]+)|((channel\/)?(?P<channel_id>UC[a-zA-Z0-9_-]+)))(?:\/.*)?$"
HANDLE_TO_ID_REGEX = r"\"header\":\{\"c4TabbedHeaderRenderer\":\{\"channelId\":\"(?P<channel_id>UC[a-zA-Z0-9_-]+)\""
POST_DATE_REGEX = r"(?P<magnitude>[0-9]{1,2}) (?P<unit>(second|minute|hour|day|week|month|year))s? ago(?P<edited> \(edited\))?$"
CLEAN_FILENAME_KINDA = r"[^\w\-_\. \[\]\(\)]"
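
# illustrative inputs these patterns are meant to accept (hypothetical examples):
#   POST_REGEX:    "UgkxJYrBY-QqIt1ysrZY0ZP84SGJLWmDmtoU"
#                  "https://www.youtube.com/post/Ugkx..."
#                  "https://www.youtube.com/channel/UC.../community?lb=Ugkx..."
#   CHANNEL_REGEX: "https://www.youtube.com/@SomeHandle/community"
#                  "https://www.youtube.com/channel/UCMwGHR0BTZuLsmjY_NT5Pwg/community"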
BLOCK_SIZE = 1024
# "month" is intentionally absent: get_time_diff_from_text() special-cases it
TIME_FACTORS = {
    "second": 1,
    "minute": 60,
    "hour": 60 * 60,
    "day": 60 * 60 * 24,
    "week": 60 * 60 * 24 * 7,  # beyond 28 days YouTube switches to "1 month ago"
    "year": 60 * 60 * 24 * 365
}

args = None


def get_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument("--cookies", metavar="COOKIES FILE", type=str, help="path to a Netscape format cookies file where cookies will be read from/written to")
    parser.add_argument("-d", "--directory", type=str, help="save directory (defaults to current)", default=os.getcwd())
    parser.add_argument("--post-archive", metavar="FILE", type=str, help="download only posts not listed in the archive file and record the IDs of newly downloaded posts")
    parser.add_argument("--dates", action="store_true", help="write information about the post publish date")
    parser.add_argument("-r", "--reverse", action="store_true", help="download posts from oldest to newest")
    parser.add_argument("links", metavar="CHANNEL", nargs="*", help="youtube channel or community post link/id")
    parser.add_argument("--skip-download", action="store_true", help="skip downloading posts, intended for only recording post IDs to the archive")
    return parser.parse_args()


def use_default_cookies():
    requests_cache.cookies.set(
        'SOCS',
        'CAESNQgDEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjIwNzA1LjE2X3AwGgJwdCACGgYIgOedlgY',
        domain='.youtube.com',
        path='/'
    )
    requests_cache.cookies.set(
        'CONSENT',
        'PENDING+917',
        domain='.youtube.com',
        path='/'
    )


def use_cookies(cookie_jar_path):
    cookie_jar = cookiejar.MozillaCookieJar(cookie_jar_path)
    try:
        cookie_jar.load()
        print_log("ytct", f"loaded cookies from {cookie_jar_path}")
    except FileNotFoundError:
        use_default_cookies()
        print_log("ytct", f"could not find cookies file {cookie_jar_path}, continuing without cookies...")
        return
    except (cookiejar.LoadError, OSError) as e:
        use_default_cookies()
        print_log("ytct", f"{e}")
        print_log("ytct", f"failed to load cookies from {cookie_jar_path}, continuing without cookies")
        return
    requests_cache.cookies = cookie_jar


def get_channel_id_from_handle(channel_handle):
    handle_url = f"https://youtube.com/{channel_handle}"
    channel_home_r = requests_cache.get(handle_url)
    if not channel_home_r.ok:
        print_log("ytct", f"failed to convert channel handle to channel id, no response from {handle_url}")
        sys.exit(1)
    channel_home = channel_home_r.text
    channel_id_m = re.search(HANDLE_TO_ID_REGEX, channel_home)
    if not channel_id_m:
        print_log("ytct", "failed to convert channel handle to channel id, data format may have changed")
        sys.exit(1)
    return channel_id_m.group("channel_id")


def get_post(post_id, post_archive):
    if post_archive:
        with open(post_archive, "r") as archive_file:
            skip_ids = archive_file.read().splitlines()
        if post_id in skip_ids:
            print_log(f"post:{post_id}", "already recorded in archive")
            return
    post = Post.from_post_id(post_id)
    handle_post(post)
    if post_archive:
        with open(post_archive, "a") as archive_file:
            archive_file.write(f"{post_id}\n")
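

# the archive file is plain text with one post ID per line; an ID is appended
# only after the post has been handled, so interrupted runs can be resumed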


def get_channel_posts(channel_id, post_archive):
    ct = CommunityTab(channel_id)
    page_count = 1
    print_log("community tab", f"getting posts from community tab (page {page_count})", "\r")
    ct.load_posts(0)
    while ct.posts_continuation_token:
        page_count += 1
        print_log("community tab", f"getting posts from community tab (page {page_count})", "\r")
        ct.load_posts(0)
    print_log("community tab", f"getting posts from community tab (page {page_count})")
    print_log("community tab", f"found {len(ct.posts)} posts")
    # only read the archive once
    skip_ids = []
    if post_archive:
        with open(post_archive, "r") as archive_file:
            skip_ids = archive_file.read().splitlines()
    if args.reverse:
        ct.posts = list(reversed(ct.posts))
    for post in ct.posts:
        if len(skip_ids) > 0 and post.post_id in skip_ids:
            print_log(f"post:{post.post_id}", "already recorded in archive")
            continue
        if not args.skip_download:
            handle_post(post)
        if post_archive:
            with open(post_archive, "a") as archive_file:
                archive_file.write(f"{post.post_id}\n")


def handle_post(post):
    post_j = post.as_json()
    if post.original_post is not None:
        if args.dates:
            post_j["original_post"]["_published"] = get_timestamp_metadata(post.original_post)
        handle_post(post.original_post)
    component = f"post:{post.post_id}"
    post_file_name = f"{post.post_id}"
    post_file_dir = args.directory
    post_file_path = os.path.join(post_file_dir, post_file_name)
    if args.dates:
        timestamp_info = get_timestamp_metadata(post)
        post_j["_published"] = timestamp_info
    try:
        if not os.path.isdir(post_file_dir):
            os.makedirs(post_file_dir)
        if os.path.isfile(f"{post_file_path}.json.tmp"):
            os.remove(f"{post_file_path}.json.tmp")
        print_log(component, f"writing {post_file_name}.json")
        with open(f"{post_file_path}.json.tmp", "w", encoding='utf8') as post_file:
            post_file.write(json.dumps(post_j, ensure_ascii=False))
        if os.path.isfile(f"{post_file_path}.json"):
            os.remove(f"{post_file_path}.json")
        os.rename(f"{post_file_path}.json.tmp", f"{post_file_path}.json")
    except Exception as e:
        print_log(component, f"failed to write file {post_file_path}")
        print_log(component, str(e))
    if post.backstage_attachment:
        handle_post_attachments(component, post.backstage_attachment, post_file_path)


def get_timestamp_metadata(post):
    timestamp_obj = {}
    # last updated time
    timestamp_obj["lastUpdatedTimestamp"] = int(datetime.utcnow().timestamp())
    # string as it appears on YouTube
    timestamp_obj["lastPublishedString"] = post.get_published_string()
    return timestamp_obj
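

# Worked example of the accuracy bookkeeping sketched in the disabled code
# below (illustrative numbers): a post first saved while YouTube shows
# "3 days ago" is known to be 72-96 hours old. If a later run happens to catch
# the moment the label flips to "4 days ago", the gap between the two runs
# bounds the publish time more tightly than the original 24-hour window.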


def handle_post_timestamp(post, path):
    timestamp_obj = get_timestamp_metadata(post)
    # code removed for the time being to prevent trashing files of inexperienced users
    # the closest UTC timestamp, and the seconds difference from the furthest UTC timestamp
    # diff_to_nearest_possible_date, timestamp_obj["timestampAccuracy"], timestamp_obj["is_edited"] = get_time_diff_from_text(timestamp_obj["lastPublishedString"])
    # if diff_to_nearest_possible_date and timestamp_obj["timestampAccuracy"]:
    #     timestamp_obj["closestTimestamp"] = timestamp_obj["lastUpdatedTimestamp"] - diff_to_nearest_possible_date
    # if os.path.isfile(f"{path}.json"):
    #     try:
    #         with open(f"{path}.json", "r") as previous_post_file:
    #             previous_post_j = json.load(previous_post_file)
    #         if "_published" in previous_post_j:
    #             previous_timestamp_obj = previous_post_j["_published"]
    #             diff_since_last_update = timestamp_obj["lastUpdatedTimestamp"] - previous_timestamp_obj["lastUpdatedTimestamp"]
    #             if previous_timestamp_obj["lastPublishedString"] == timestamp_obj["lastPublishedString"]:
    #                 # update accuracy based on time between current and last update
    #                 timestamp_obj["timestampAccuracy"] = previous_timestamp_obj["timestampAccuracy"] - diff_since_last_update
    #             elif diff_since_last_update < previous_timestamp_obj["timestampAccuracy"]:
    #                 # time between change in update is less than previous accuracy, should be safe to change
    #                 # i.e. if you save a post 3 days after publish, accuracy is 72-96 hours
    #                 # if you then update 364 days after publish, and update again 1 year after publish
    #                 # the diff since last update is 24 hours, which is better than before
    #                 timestamp_obj["timestampAccuracy"] = diff_since_last_update
    #             else:
    #                 # keep previous accuracy
    #                 timestamp_obj["timestampAccuracy"] = previous_timestamp_obj["timestampAccuracy"]
    #             if previous_timestamp_obj["closestTimestamp"] < timestamp_obj["closestTimestamp"]:
    #                 # if closest timestamp is not better than previous, keep previous
    #                 timestamp_obj["closestTimestamp"] = previous_timestamp_obj["closestTimestamp"]
    #     except Exception as e:
    #         print_log("community post", f"failed to open previously downloaded post {post.post_id}")
    #         print_log("community post", str(e))


def get_time_diff_from_text(published_text):
    post_date_m = re.search(POST_DATE_REGEX, published_text)
    if post_date_m:
        mag = int(post_date_m.group("magnitude"))
        unit = post_date_m.group("unit")
        delta_secs = 0
        accuracy = 0
        if unit == "month":
            # absolute madness beyond this point
            delta_secs += TIME_FACTORS["day"] * 28
            if mag != 1:
                delta_secs += (mag - 1) * TIME_FACTORS["day"] * 30.4
            accuracy = TIME_FACTORS["day"] * 31 - 1
        else:
            delta_secs = mag * TIME_FACTORS[unit]
            accuracy = TIME_FACTORS[unit] - 1
        edited = False
        if post_date_m.group("edited"):
            edited = True
        return (delta_secs, accuracy, edited)
    else:
        print_log("community post:date", f"could not parse '{published_text}', open an issue?")
        return (None, None, None)
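

# illustrative outputs (not exercised by the script itself):
#   get_time_diff_from_text("3 days ago")           -> (259200, 86399, False)
#   get_time_diff_from_text("1 month ago (edited)") -> (2419200, 2678399, True)
# i.e. "1 month" is anchored at the 28-day flip-over, with a worst-case
# accuracy window of just under 31 days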


def handle_post_attachments(component, attachment, path):
    if "postMultiImageRenderer" in attachment:
        num_images = len(attachment["postMultiImageRenderer"]["images"])
        print_log(component, f"downloading {num_images} attached images")
        for image_i in range(num_images):
            handle_post_attachments(component, attachment["postMultiImageRenderer"]["images"][image_i], f"{path}_{image_i}")
    elif "backstageImageRenderer" in attachment:
        print_log(component, "downloading image")
        image_url = attachment["backstageImageRenderer"]["image"]["thumbnails"][-1]["url"].split("=", 1)[0] + "=s0?imgmax=0"
        image_r = requests.get(image_url, stream=True, allow_redirects=True)
        image_ext = image_r.headers["Content-Type"].split("/", 1)[1].replace("jpeg", "jpg")
        image_path = f"{path}.{image_ext}"
        if not os.path.isfile(image_path):
            if os.path.isfile(f"{image_path}.tmp"):
                os.remove(f"{image_path}.tmp")
            with open(f"{image_path}.tmp", "wb") as image_file:
                for chunk in image_r.iter_content(BLOCK_SIZE):
                    image_file.write(chunk)
            os.rename(f"{image_path}.tmp", image_path)
        else:
            print_log(component, "image already downloaded, skipping")
    elif "videoRenderer" in attachment:
        thumb_url = None
        if "videoId" in attachment["videoRenderer"]:
            video_id = attachment["videoRenderer"]["videoId"]
            thumb_url = f"https://i.ytimg.com/vi/{video_id}/maxresdefault.jpg"
        elif "thumbnail" in attachment["videoRenderer"]:
            thumb_url = urlparse.urljoin(attachment["videoRenderer"]["thumbnail"]["thumbnails"][-1]["url"], "maxresdefault.jpg")
            print_log(component, "could not get video ID, video may be private or deleted")
        if thumb_url:
            print_log(component, "downloading thumbnail")
            thumb_r = requests.get(thumb_url, stream=True, allow_redirects=True)
            thumb_ext = thumb_r.headers["Content-Type"].split("/", 1)[1].replace("jpeg", "jpg")
            thumb_path = f"{path}_thumb.{thumb_ext}"
            if not os.path.isfile(thumb_path):
                if os.path.isfile(f"{thumb_path}.tmp"):
                    os.remove(f"{thumb_path}.tmp")
                with open(f"{thumb_path}.tmp", "wb") as thumb_file:
                    for chunk in thumb_r.iter_content(BLOCK_SIZE):
                        thumb_file.write(chunk)
                os.rename(f"{thumb_path}.tmp", thumb_path)
            else:
                print_log(component, "thumbnail already downloaded, skipping")
        else:
            print_log(component, "could not get video thumbnail url for post")


def clean_name(text):
    return re.sub(CLEAN_FILENAME_KINDA, "_", text)


def print_log(component, message, end="\n"):
    print(f"[{component}] {message}", end=end)


if __name__ == "__main__":
    args = get_arguments()
    # set cookies for retrieving posts that need auth
    if args.cookies:
        use_cookies(args.cookies)
    else:
        use_default_cookies()
    usable_archive = None
    if args.post_archive:
        # make sure the directory of the archive file exists, creating it if necessary
        log_path = os.path.dirname(args.post_archive)
        if log_path and not os.path.isdir(log_path):
            try:
                os.makedirs(log_path)
            except OSError:
                print_log("ytct", "failed to create log directory")

        try:
            with open(args.post_archive, "a"):
                pass
            usable_archive = args.post_archive
        except OSError:
            print_log("ytct", f"cannot write to the archive file {args.post_archive}, continuing...")
    if not os.path.isdir(args.directory):
        try:
            os.makedirs(args.directory)
        except OSError:
            print_log("ytct", "failed to create output directory")
            sys.exit(1)
    for link in args.links:
        post_id_m = re.search(POST_REGEX, link)
        channel_id_m = re.search(CHANNEL_REGEX, link)
        if post_id_m:
            post_id = post_id_m.group("post_id")
            get_post(post_id, usable_archive)
        elif channel_id_m:
            channel_handle = channel_id_m.group("channel_handle")
            if channel_handle:
                channel_id = get_channel_id_from_handle(channel_handle)
            else:
                channel_id = channel_id_m.group("channel_id")
            get_channel_posts(channel_id, usable_archive)
        else:
            print_log("ytct", f"could not parse link/id {link}")
    print_log("ytct", "finished")
--------------------------------------------------------------------------------