├── Changelog.md ├── setup.py ├── LICENSE ├── ytdump.py ├── README.md └── yttool.py /Changelog.md: -------------------------------------------------------------------------------- 1 | 1.0.11 - 2021-05-14 2 | * fixed commentlist problem with some videos. 3 | 4 | 1.0.10 - 2021-04-25 5 | * --livechat now monitors the livechat 6 | 7 | 1.0.9 - 2021-04-25 8 | * all options work again, after changes to the youtube api. 9 | 10 | 1.0.8 - 2021-03-29 11 | * made comments work again. 12 | 13 | 1.0.7 - 2021-02-16 14 | * added optional socks / tor proxy 15 | * output text as utf-8. 16 | 17 | 1.0.6 - 2021-01-13 18 | * repaired 'chat replay' 19 | * repaired -l option 20 | 21 | 1.0.5 - 2020-08-17 22 | * fixed issue with 'too large request' error, for video's with lots of comments. 23 | 24 | 1.0.4 - 2020-06-18 initial release 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | setup( 3 | name = "youtube_tool", 4 | version = "1.0.11", 5 | entry_points = { 6 | 'console_scripts': ['yttool=yttool:main'], 7 | }, 8 | py_modules=['yttool'], 9 | author = "Willem Hengeveld", 10 | author_email = "itsme@xs4all.nl", 11 | description = "Extract information from youtube video's", 12 | long_description=""" 13 | Commandline tool which can extract comments, subtitles or livechat 14 | content from a youtube video. It can also list all video's 15 | in a playlist, or from a search result. 16 | """, 17 | 18 | license = "MIT", 19 | keywords = "youtube commandline", 20 | url = "https://github.com/nlitsme/youtube_tool/", 21 | classifiers = [ 22 | 'Environment :: Console', 23 | 'Intended Audience :: End Users/Desktop', 24 | 'Intended Audience :: Developers', 25 | 'License :: OSI Approved :: MIT License', 26 | 'Operating System :: OS Independent', 27 | 'Programming Language :: Python :: 3', 28 | 'Topic :: Utilities', 29 | ], 30 | python_requires = '>=3.8', 31 | ) 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Willem Hengeveld 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /ytdump.py: -------------------------------------------------------------------------------- 1 | """ 2 | A tool for investigating youtube json dictionaries. 3 | 4 | This tries to pretty print the rather complex json dictionaries youtube uses. 5 | You can pass the json either through stdin, pass it as a string on the commandline, 6 | or as a filename on the commandline. 7 | 8 | 9 | Author: Willem Hengeveld 10 | """ 11 | import json 12 | import sys 13 | import os.path 14 | 15 | 16 | def extractruns(runs): 17 | """ 18 | Extract all text in a 'runs' dictionary. 19 | """ 20 | text = [] 21 | for r in runs: 22 | text.append(r.get('text')) 23 | return "".join(text) 24 | 25 | 26 | def pathendswith(path, *end): 27 | """ 28 | A helper for matching paths in the json dictionary. 29 | """ 30 | if len(end) > len(path): 31 | return False 32 | for a, b in zip(path[-len(end):], end): 33 | if type(b)==type: 34 | if type(a)!=b: 35 | return False 36 | elif type(b)==int: 37 | if a != b: 38 | return False 39 | elif type(a)==int: 40 | return False 41 | elif b[:1] == '*': 42 | if not a.endswith(b[1:]): 43 | return False 44 | else: 45 | if a != b: 46 | return False 47 | return True 48 | 49 | 50 | def processRender(j, path): 51 | """ 52 | print all properties directly under 'j' 53 | """ 54 | info = [] 55 | for k, item in j.items(): 56 | if type(item) in (int, float, str, bool): 57 | info.append((k, item)) 58 | elif type(item) != dict: 59 | pass 60 | elif runs := item.get('runs'): 61 | info.append((k, extractruns(runs))) 62 | elif text := item.get("simpleText"): 63 | info.append((k, text)) 64 | indent = " " * len(path) 65 | print(indent, "==== %s" % (path[::-1],)) 66 | for k, v in info: 67 | print(indent, "| %-20s : %s" % (k, v)) 68 | 69 | 70 | def process(j, path=[]): 71 | """ 72 | recursively process the json dictionary passed in 'j'. 73 | 74 | Printing all 'Renderer' dictionaries in detail, indented according to path length. 75 | 76 | The path is the list of keys needed to find the current entry from the top. 77 | """ 78 | if path: 79 | if pathendswith(path, "*Renderer"): 80 | if type(j)!=dict: 81 | print("WARNING: Renderer without dict", path) 82 | else: 83 | processRender(j, path) 84 | elif pathendswith(path, "continuations"): 85 | if not pathendswith(path, "*Renderer", "continuations"): 86 | print("WARNING: continuations without renderer", path) 87 | pass 88 | elif pathendswith(path, "nextContinuationData"): 89 | if not pathendswith(path, "continuations", int, "nextContinuationData"): 90 | print("WARNING: nextContinuationData without continuations", path) 91 | pass 92 | elif pathendswith(path, "continuation"): 93 | if not pathendswith(path, "nextContinuationData", "continuation"): 94 | print("WARNING: continuation without nextContinuationData", path) 95 | pass 96 | 97 | if type(j) == list: 98 | for i, item in enumerate(j): 99 | process(item, path + [i]) 100 | elif type(j) == dict: 101 | for k, item in j.items(): 102 | process(item, path + [k]) 103 | elif type(j) in (int, float, str, bool, type(None)): 104 | pass 105 | else: 106 | print("WARNING: unexpected type", type(j), j) 107 | 108 | 109 | def main(): 110 | if len(sys.argv)==1: 111 | data = sys.stdin.read() 112 | j = json.loads(data) 113 | process(j) 114 | else: 115 | for arg in sys.argv[1:]: 116 | if os.path.exists(arg): 117 | try: 118 | with open(arg, "r") as fh: 119 | print("==>", arg, "<==") 120 | j = json.load(fh) 121 | process(j) 122 | except Exception as e: 123 | print("ERROR reading %s: %s" % (arg, e)) 124 | else: 125 | print("==> json commandline argument <==") 126 | j = json.loads(arg) 127 | process(j) 128 | 129 | if __name__ == '__main__': 130 | main() 131 | 132 | 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # yttool 2 | 3 | A tool for extracting info from youtube: 4 | * print all comments for a video 5 | * print a video's description + info 6 | * print all subtitles for a video 7 | * print out an entire livechat replay. 8 | * list all items in a playlist 9 | * list all videos for a channel or user 10 | * list all video's matching a query 11 | 12 | # install 13 | 14 | You can install this from the official python repository using `pip`: 15 | 16 | pip3 install youtube-tool 17 | 18 | This will add a command `yttool` to your python binaries directory, 19 | and probably also to your search path. So you can run this like: 20 | 21 | yttool ....arguments.... 22 | 23 | Note: depending on your local python installation(s), you may have to type 24 | one of `pip`, `pip3`, or maybe even: `pip3.8`. 25 | 26 | 27 | You can also 'install' this by executing the `yttool.py` file directly from 28 | the source directory: 29 | 30 | python3 yttool.py ....arguments... 31 | 32 | 33 | # requirements 34 | 35 | This script needs python 3.8 or later to run. 36 | The python3.8 specific feature I am using is the new `:=` walrus operator. 37 | 38 | 39 | # usage 40 | 41 | ## list all subtitles attached to a video. 42 | 43 | This will output the subtitles in all available languages. 44 | 45 | yttool --subtitles https://www.youtube.com/watch?v=bJOuzqu3MUQ 46 | 47 | Or list the subtitles prefixed with timestamps 48 | 49 | yttool -v --subtitles https://www.youtube.com/watch?v=bJOuzqu3MUQ 50 | 51 | 52 | You can also extract the subtitles in a format suitable for 53 | creating `.srt` subtitle files: 54 | 55 | yttool --srt --subtitles https://www.youtube.com/watch?v=bJOuzqu3MUQ 56 | 57 | 58 | Or you can filter by language, for example only output the english subtitles: 59 | 60 | yttool --language en --subtitles https://www.youtube.com/watch?v=0xY06PT5JDE 61 | 62 | Or only output the automatically generated subtitles: 63 | 64 | yttool --language asr --subtitles https://www.youtube.com/watch?v=0xY06PT5JDE 65 | 66 | 67 | ## comments 68 | 69 | List all the comments for this Numberphile video: 70 | 71 | yttool --comments https://www.youtube.com/watch?v=bJOuzqu3MUQ 72 | 73 | 74 | ## livechat replay 75 | 76 | Print out an entire livechat replay: 77 | 78 | yttool --replay https://www.youtube.com/watch?v=lE0u_jIDh0E 79 | 80 | ## follow an active livechat 81 | 82 | Note: this does not yet work! 83 | 84 | Print messages from a livechat as they come: 85 | 86 | yttool --livechat https://www.youtube.com/watch?v=EEIk7gwjgIM 87 | 88 | 89 | ## list a playlist contents. 90 | 91 | List all the video's contained in this System of a Down playlist: 92 | 93 | yttool --playlist https://www.youtube.com/playlist?list=PLSKnqXUHTaSdXuK8Z2d-hXLFtJbRZwPtJ 94 | 95 | The output will look like this: 96 | 97 | CSvFpBOe8eY - System Of A Down - Chop Suey! (Official Video) 98 | zUzd9KyIDrM - System Of A Down - B.Y.O.B. (Official Video) 99 | L-iepu3EtyE - System Of A Down - Aerials (Official Video) 100 | iywaBOMvYLI - System Of A Down - Toxicity (Official Video) 101 | DnGdoEa1tPg - System Of A Down - Lonely Day (Official Video) 102 | LoheCz4t2xc - System Of A Down - Hypnotize (Official Video) 103 | 5vBGOrI6yBk - System Of A Down - Sugar (Official Video) 104 | SqZNMvIEHhs - System Of A Down - Spiders (Official Video) 105 | ENBv2i88g6Y - System Of A Down - Question! (Official Video) 106 | bE2r7r7VVic - System Of A Down - Boom! (Official Video) 107 | F46r-_jPPHY - System Of A Down - War? (Official Video) 108 | 109 | The first 11 characters are the video id, you can load the corresponding video 110 | by typing: `https://www.youtube.com/watch?v=5vBGOrI6yBk` in your browser's URL bar. 111 | 112 | 113 | Or list all video's from a channel: 114 | 115 | yttool -l https://www.youtube.com/channel/UCoxcjq-8xIDTYp3uz647V5A 116 | 117 | Or when you don't know the channelid, you can get the same with the username: 118 | 119 | yttool -l https://www.youtube.com/user/numberphile 120 | 121 | 122 | ## list query results 123 | 124 | This: 125 | 126 | yttool -q somequery 127 | 128 | Will list first couple of the video's matching that query. 129 | 130 | ## Just the id's 131 | 132 | You can also call yttool with only the video id as an argument: 133 | 134 | yttool --info CSvFpBOe8eY 135 | 136 | 137 | # How to use with a proxy? 138 | 139 | For example if you would like to use TOR, you would do this: 140 | 141 | yttool --proxy socks5://localhost:9050 --info https://www.youtube.com/watch?v=Ll-_LV9U1tA 142 | 143 | Note that setting a socks proxy via the `https_proxy` environment variable does NOT work very well with python's urllib library. 144 | 145 | 146 | # How does it work? 147 | 148 | This script does not use the official youtube API, instead, it uses youtube's internal api, which is 149 | what is used on the youtube website itself. This does mean there is no guarantee that this script 150 | will keep working without maintenance. Youtube will keep changing the way it works internally. 151 | So I will need to keep updating this script. 152 | 153 | The advantage of using the internal API, is that there are apparently no limits to how many requests you 154 | can do. And you don't have to bother with any kind of registration. 155 | 156 | 157 | These are the main internal api urls I am using: 158 | 159 | - comments: `https://www.youtube.com/comment_service_ajax` 160 | - livechat: `https://www.youtube.com/live_chat_replay/get_live_chat_replay` 161 | - search: `https://www.youtube.com/youtubei/v1/search` 162 | - playlists: `https://www.youtube.com/browse_ajax` 163 | 164 | Also, you can get youtube to respond with json instead of html by adding a `&pbj=1` argument to most urls, 165 | and add http headers: `x-youtube-client-name: 1` and `x-youtube-client-version: 2.20200603.01.00` to your request. 166 | Also the user-agent header needs to be of the right format, see my script for a working example. 167 | 168 | Then, for search you need to add a `innertubeapikey`. Which I have currently hardcoded in my script, as i did with the client-version. 169 | A future improvement would be to automatically extract these from the current youtube front page. 170 | 171 | 172 | # Note about the structure of youtube video id's 173 | 174 | Youtube's id's are structured in several ways: 175 | 176 | A videoid is 11 characters long, when decoded using base64, this results in exactly 8 bytes. 177 | The last character of a videoid can only be: `048AEIMQUYcgkosw` --> 10x6+4 = 64 bits 178 | 179 | A playlist id is either 24 or 34 characters long, and has the following format: 180 | 181 | ### id's containing a 'playlist' id. 182 | 183 | * "PL" or "EC" -- custom playlist, or educational playlist. 184 | * "BP" and "SP" also seem to have some kind of function. 185 | * playlistid can be: 186 | * either 32 base64 characters --> either a 6x32 = 192 bits 187 | * or or 16 hex characters --> either a 16x4 = 64 bits 188 | * www.youtube.com/playlist?list=PL 189 | * www.youtube.com/course?list=EC 190 | * no longer works very well, the layout of the `course` page is broken, 191 | with lots of overlapping text. 192 | 193 | ### id's containing a channel id 194 | 195 | A channel-id is 22 base64 characters, with the last character one of: `AQgw`, so this decodes to 21x6+2 = 128 bits 196 | 197 | * "UC" -- user channel 198 | * www.youtube.com/channel/UC 199 | * "PU" -- popular uploads playlist 200 | * quick way to load: www.youtube.com/watch?v=xxxxxxxxxxx&list=PU 201 | * "UU" -- user uploads playlist 202 | * quick way to load: www.youtube.com/watch?v=xxxxxxxxxxx&list=UU 203 | * "LL" -- liked video's for user 204 | * quick way to load: www.youtube.com/watch?v=xxxxxxxxxxx&list=LL 205 | * or www.youtube.com/playlist?list=LL 206 | * "FL" -- favorites 207 | * www.youtube.com/watch?v=xxxxxxxxxxx&list=FL 208 | * "RDCMUC" -- mix for channel 209 | * www.youtube.com/watch?v=xxxxxxxxxxx&list=RDCMUC 210 | 211 | * prefixes CL, EL, MQ, TT, WL also seem to have a special meaning 212 | 213 | ### Other playlist types 214 | 215 | These take 216 | * "TLGG<22chars>" -- temporary list - redir from `watch_videos` 217 | * When decoded, the last 8 bytes are digits for the "ddmmyyyy" date. 218 | * "RDEM<22chars>" -- radio channel 219 | * 22chars is NOT a channel-id 220 | * www.youtube.com/watch?v=xxxxxxxxxxx&list=RDEM<22chars> 221 | * "RD" -- mix for a specific video. 222 | * "OLAK5uy_<33chars>" -- album playlist. 223 | * id's start with: `klmn` : 0b1001xx 224 | * id's ends with: `AEIMQUYcgkosw048` --> 2 + 31x6 + 4 = 192 bits 225 | * www.youtube.com/playlist?list=OLAK5uy_<33chars> 226 | * "WL" -- 'watch later' 227 | * www.youtube.com/playlist?list=WL 228 | * www.youtube.com/watch?v=xxxxxxxxxxx&list=WL 229 | * "UL" -- channel video mix 230 | * www.youtube.com/watch?v=<11charsvidid>&list=ULxxxxxxxxxxx 231 | * This works only when there are exactly 11 characters after 'UL' 232 | * "LM" -- music.youtube likes 233 | * "RDMM" -- music.youtube your mix 234 | * "RDAMVM" -- music.youtube band mix 235 | * "RDAO<22chars>" 236 | * "RDAMPL" + prefix+playlistid 237 | * "RDCLAK5uy_" + 33chars 238 | * "RDTMAK5uy_" + 33chars 239 | 240 | * prefixes EL, CL also seem to have a special meaning. 241 | 242 | 243 | ### post id's 244 | 245 | * 26 characters: Ug<17chars>4AaABCQ 246 | * id's start with [wxyz] : 0b1100xx 247 | * id's end with [BFJNRVZdhlptx159] : 0bxxxx01 248 | -> 2 + 15*6 + 4 = 96 bits 249 | 250 | # Youtube url's 251 | 252 | Domains: 253 | 254 | youtu.be 255 | youtube.com 256 | 257 | UrlPath: 258 | 259 | /watch?v=&t=123s&list= 260 | /v/ 261 | /embed/ 262 | /embed/videoseries?list= 263 | /watch/ 264 | /playlist?list= 265 | /channel/ 266 | /user/ 267 | /watch_videos?video_ids=,,... 268 | 269 | # protoc 270 | 271 | Some id's are base64 encoded protobuf packets, like: clickTrackingParams, continuation. 272 | 273 | 274 | # Research tool 275 | 276 | I added a tool: `ytdump.py`, which i use to investigate youtube json dictionaries. 277 | 278 | # TODO 279 | 280 | * DONE extract 'listid' from video links for playlist view. 281 | * DONE list a channel's video's 282 | * DONE list a user's video's 283 | * handle radio links 284 | * DONE extract live-chat comments 285 | * Filter out duplicates from the livechat replay dump. 286 | * DONE make my tool work with an actual live chat. 287 | * DONE youtube search results. 288 | * generalize the way continuations are used. 289 | * add upload date and duration in the video lists. 290 | * DONE automatically update the innertubeapikey and clientversion 291 | * get original filename from studio.youtube.com/video//edit 292 | * playlist editor / organiser 293 | * community post listing 294 | * list all on video messages, like cards, etc. 295 | * list video markers, like in https://www.youtube.com/watch?v=i2KdE-cYMJk 296 | * list other videos from the same channel. 297 | * add time, likes to comments 298 | * repair the `--replay` option. 299 | 300 | 301 | # AUTHOR 302 | 303 | Willem Hengeveld 304 | 305 | -------------------------------------------------------------------------------- /yttool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | A tool for extracting useful information from youtube video's, like comments, or subtitles. 4 | 5 | Author: Willem Hengeveld 6 | 7 | """ 8 | 9 | import urllib.request 10 | import urllib.parse 11 | import http.cookiejar 12 | import re 13 | import json 14 | import sys 15 | import html 16 | import datetime 17 | from collections import defaultdict 18 | from xml.parsers.expat import ParserCreate 19 | 20 | import http.client 21 | 22 | 23 | def load_socks_proxy(proxyarg): 24 | m = re.match(r'(?:(\w+)://)?(\S+):(\d+)', proxyarg) 25 | if not m: 26 | return 27 | method, host, port = m.groups() 28 | port = int(port) 29 | 30 | if not method or not method.startswith('socks'): 31 | return 32 | 33 | import socks 34 | socks.setdefaultproxy(socks.SOCKS4 if method.startswith('socks4') else socks.SOCKS5, host, port) 35 | def create_connection(address, timeout=None, source_address=None): 36 | sock = socks.socksocket() 37 | sock.connect(address) 38 | return sock 39 | import socket 40 | socket.create_connection = create_connection 41 | socket.socket = socks.socksocket 42 | 43 | 44 | def decode_proxy(proxyarg): 45 | if m:= re.match(r'(?:(\w+)://)?(\S+):(\d+)', proxyarg): 46 | method, host, port = m.groups() 47 | port = int(port) 48 | if not method or method.startswith('http'): 49 | return { 'http': proxyarg, 'https': proxyarg } 50 | 51 | def cvdate(txt): 52 | """ 53 | Convert a string with a date in ymd format to a date object. 54 | """ 55 | ymd = txt.split("-") 56 | if len(ymd)!=3: 57 | print("WARNING: invalid date format: %s" % txt) 58 | return 59 | y, m, d = [int(_) for _ in ymd] 60 | return datetime.date(y, m, d) 61 | 62 | 63 | def cvseconds(txt): 64 | """ 65 | Convert string containing a number of seconds to a timedelta object. 66 | """ 67 | return datetime.timedelta(seconds=int(txt)) 68 | 69 | 70 | def getitembymember(a, member): 71 | """ 72 | Get the first item from 'a' which has an element named 'member' 73 | """ 74 | for item in a: 75 | if member in item: 76 | return item 77 | 78 | 79 | def getitem(d, *path): 80 | """ 81 | Traverse a nested python object, path items select which object is selected: 82 | * a tuple: selects a dictionary from a list which contains the specified key 83 | * an integer: select the specified item from a list. 84 | * a string: select the specified item from a dictionary. 85 | """ 86 | for k in path: 87 | if d is None: 88 | return 89 | if type(k) == tuple: 90 | d = getitembymember(d, *k) 91 | elif type(k) == int: 92 | d = d[k] 93 | else: 94 | d = d.get(k) 95 | 96 | return d 97 | 98 | def extracttext(entry): 99 | return entry.get("simpleText") or "".join(r.get('text', "") for r in entry.get("runs")) 100 | 101 | 102 | def getcontinuation(p): 103 | cont = getitem(p, "contents", 0, "continuationItemRenderer") 104 | if cont: 105 | return cont 106 | 107 | p = getitem(p, "continuations", 0, "nextContinuationData") 108 | if p: 109 | return p["continuation"], p["clickTrackingParams"] 110 | 111 | 112 | class Youtube: 113 | """ 114 | Class which knows how to get information from youtune video's 115 | """ 116 | def __init__(self, args): 117 | self.args = args 118 | cj = http.cookiejar.CookieJar() 119 | cj.set_cookie(http.cookiejar.Cookie(version=0, name="CONSENT", value="YES+cb.20210420-15-p1.en+FX+374", port=None, port_specified=False, domain=".youtube.com", domain_specified=True, domain_initial_dot=True, path="/", path_specified=True, secure=False, expires=None, discard=False, comment=None, comment_url=None, rest={})) 120 | 121 | self.cp = urllib.request.HTTPCookieProcessor(cj) 122 | 123 | handlers = [self.cp] 124 | if args.proxy: 125 | proxies = decode_proxy(args.proxy) 126 | if proxies: 127 | handlers.append(urllib.request.ProxyHandler(proxies)) 128 | if args.debug: 129 | handlers.append(urllib.request.HTTPSHandler(debuglevel=1)) 130 | self.opener = urllib.request.build_opener(*handlers) 131 | self.innertubeapikey = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8" # "INNERTUBE_API_KEY": "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", 132 | self.clientname = "1" # "INNERTUBE_CONTEXT_CLIENT_NAME": 1, 133 | self.clientversion = "2.20210422.04.00" # "INNERTUBE_CONTEXT_CLIENT_VERSION": "2.20210404.08.00", 134 | self.idtoken = "QUFFLUhqa1oySl9mbm9mODhfdENjQWdDcENvazM2RS1qZ3w=" # "ID_TOKEN": "QUFFLUhqa1oySl9mbm9mODhfdENjQWdDcENvazM2RS1qZ3w=", 135 | 136 | html = self.httpreq("https://www.youtube.com/") 137 | cfg = self.getytcfg(html.decode('utf-8')) 138 | 139 | self.innertubeapikey = cfg.get("INNERTUBE_API_KEY") 140 | self.clientname = cfg.get("INNERTUBE_CONTEXT_CLIENT_NAME") 141 | self.clientversion = cfg.get("INNERTUBE_CONTEXT_CLIENT_VERSION") 142 | 143 | 144 | def httpreq(self, url, data=None): 145 | """ 146 | Does GET or POST request to youtube. 147 | """ 148 | hdrs = { 149 | "x-youtube-client-name": "1", 150 | "x-youtube-client-version": self.clientversion, 151 | #"X-Youtube-Identity-Token": self.idtoken, 152 | "User-Agent": "Mozilla/5.0 (Mac) Gecko/20100101 Firefox/76.0", 153 | } 154 | if type(data)==bytes and data[:1] in (b'{', b'['): 155 | hdrs["Content-Type"] = "application/json" 156 | 157 | req = urllib.request.Request(url, headers=hdrs) 158 | 159 | kwargs = dict() 160 | if data is not None: 161 | kwargs["data"] = data 162 | 163 | response = self.opener.open(req, **kwargs) 164 | try: 165 | page = response.read() 166 | except http.client.IncompleteRead as e: 167 | page = e.partial 168 | print("EXCEPTION FOUND: http.client.IncompleteRead") 169 | pass 170 | 171 | return page 172 | 173 | def getcomments(self, cont, xsrf, replies=False): 174 | """ 175 | Returns comments for the specified continuation parameter. 176 | """ 177 | cmd = getitem(cont, "continuationEndpoint") or getitem(cont, "button", "buttonRenderer", "command") 178 | url = getitem(cmd, "commandMetadata", "webCommandMetadata", "apiUrl") 179 | postreq = { 180 | "context":{"client":{"clientName":"WEB","clientVersion":self.clientversion}}, 181 | "continuation": getitem(cmd, "continuationCommand", "token"), 182 | } 183 | 184 | return self.httpreq("https://www.youtube.com" + url + "?" + urllib.parse.urlencode({"key":self.innertubeapikey}), json.dumps(postreq).encode('utf-8') ) 185 | 186 | def getchat(self, cont, live=False): 187 | """ 188 | Returns chat for the specified continuation parameter. 189 | """ 190 | if live: 191 | url = "https://www.youtube.com/live_chat" 192 | else: 193 | url = "https://www.youtube.com/live_chat_replay" 194 | query = { 195 | "pbj": 1, 196 | "continuation": cont, 197 | } 198 | 199 | return self.httpreq(url + "?" + urllib.parse.urlencode(query)) 200 | 201 | def getchat2(self, cont, offset, live=False): 202 | """ 203 | Returns chat for the specified continuation parameter. 204 | """ 205 | if live: 206 | url = "https://www.youtube.com/youtubei/v1/live_chat_replay/get_live_chat" 207 | else: 208 | url = "https://www.youtube.com/youtubei/v1/live_chat_replay/get_live_chat_replay" 209 | query = { 210 | "pbj": 1, 211 | "continuation": cont, 212 | "playerOffsetMs": offset, 213 | "hidden": False, 214 | "commandMetadata": "[object Object]", 215 | } 216 | 217 | return self.httpreq(url + "?" + urllib.parse.urlencode(query)) 218 | 219 | def getlivechat(self, cont): 220 | url = "https://www.youtube.com/youtubei/v1/live_chat/get_live_chat" 221 | query = { "key": self.innertubeapikey, } 222 | postdata = { 223 | "context": { "client": { "clientName": "WEB", "clientVersion": self.clientversion } }, 224 | "continuation": cont 225 | } 226 | 227 | return self.httpreq(url + "?" + urllib.parse.urlencode(query), json.dumps(postdata).encode('utf-8')) 228 | 229 | 230 | def getsearch(self, cont): 231 | """ 232 | Returns next batch of search results 233 | """ 234 | url = "https://www.youtube.com/youtubei/v1/search" 235 | query = { 236 | "key": self.innertubeapikey 237 | } 238 | postdata = { 239 | "context": { "client": { "clientName": "WEB", "clientVersion": self.clientversion } }, 240 | "continuation": cont, 241 | } 242 | postdata = json.dumps(postdata) 243 | return self.httpreq(url + "?" + urllib.parse.urlencode(query), postdata.encode('ascii')) 244 | 245 | def browse(self, cont): 246 | """ 247 | Returns videos for the specified continuation parameter. 248 | """ 249 | cmd = getitem(cont, "continuationEndpoint") 250 | url = getitem(cmd, "commandMetadata", "webCommandMetadata", "apiUrl") 251 | postreq = { 252 | "context":{"client":{"clientName":"WEB","clientVersion":self.clientversion}}, 253 | "continuation": getitem(cmd, "continuationCommand", "token"), 254 | } 255 | 256 | return self.httpreq("https://www.youtube.com" + url + "?" + urllib.parse.urlencode({"key":self.innertubeapikey}), json.dumps(postreq).encode('utf-8') ) 257 | 258 | 259 | def getpageinfo(self, yturl): 260 | """ 261 | Returns the youtube configuration object. 262 | """ 263 | ytcfgtext = self.httpreq(yturl + ("&" if yturl.find('?')>=0 else "?") + "pbj=1") 264 | if self.args.debug: 265 | print("============ youtube config") 266 | print(ytcfgtext.decode('utf-8')) 267 | print() 268 | 269 | try: 270 | return json.loads(ytcfgtext.lstrip(b")]}'")) 271 | except Exception as e: 272 | if self.args.verbose: 273 | print("EXCEPTION in getpageinfo: %s" % e) 274 | if self.args.debug: 275 | raise 276 | return 277 | 278 | def getytcfg(self, ythtml): 279 | ytcfg = {} 280 | for m in re.finditer(r'ytcfg\.set\((\{.*?\})\)', ythtml): 281 | jsontxt = m.group(1).replace("'", '"').replace('",}', '"}') 282 | ytcfg.update(json.loads(jsontxt)) 283 | return ytcfg 284 | 285 | 286 | def getconfigfromhtml(self, ythtml): 287 | """ 288 | Alternative method of extracting the config object. 289 | By parsing the html page returned by youtube. 290 | """ 291 | if self.args.debug: 292 | print("============ youtube page") 293 | print(ythtml.decode('utf-8')) 294 | print() 295 | 296 | m = re.search(br'ytplayer.config = (.*?);ytplayer.load', ythtml) 297 | if not m: 298 | print("could not find config") 299 | return 300 | cfgtext = m.group(1) 301 | if self.args.debug: 302 | print("========== config json") 303 | print(cfgtext.decode('utf-8')) 304 | print() 305 | 306 | cfg = json.loads(cfgtext) 307 | 308 | playertext = cfg['args']['player_response'] 309 | if self.args.debug: 310 | print("========== player json") 311 | print(playertext) 312 | print() 313 | return json.loads(playertext) 314 | 315 | def extractsearchconfig(self, html): 316 | if self.args.debug: 317 | print("============ youtube page") 318 | print(html.decode('utf-8')) 319 | print() 320 | m = re.search(br'window["ytInitialData"] = (.*);', html) 321 | if not m: 322 | print("could not find config") 323 | return 324 | cfgtext = m.group(1) 325 | if self.args.debug: 326 | print("========== config json") 327 | print(cfgtext.decode('utf-8')) 328 | print() 329 | 330 | return json.loads(cfgtext) 331 | 332 | def strunescape(txt): 333 | txt = re.sub(r'\\x(\w\w)', lambda m:chr(int(m.group(1), 16)), txt) 334 | txt = re.sub(r'\\n', "\n", txt) 335 | txt = re.sub(r'\\r', "\r", txt) 336 | txt = re.sub(r'\\t', "\t", txt) 337 | txt = re.sub(r'\\/', "/", txt) 338 | return txt 339 | 340 | def filterhtml(html): 341 | """ 342 | extract 4 different dictionaries from the html page. 343 | -- ytInitialPlayerResponse 344 | -- ytcfg.set() 345 | -- ytplayer.web_player_context_config 346 | -- ytInitialData 347 | """ 348 | 349 | result = {} 350 | for m in re.finditer(r'ytcfg\.set\(([^{}]*?),([^{}]*?)\)', html): 351 | #print("yt1", m.groups()) 352 | pass 353 | result["ytcfg"] = {} 354 | for m in re.finditer(r'ytcfg\.set\((\{.*?\})\)', html): 355 | #print("yt2", m.group(1)) 356 | jsontxt = m.group(1).replace("'", '"').replace('",}', '"}') 357 | result["ytcfg"].update(json.loads(jsontxt)) 358 | # TIMING_INFO.cver: "2.20210111.08.00", 359 | 360 | if m := re.search(r'', html): 361 | #print("ld", m.group(1)) 362 | result["ldjson"] = json.loads(strunescape(m.group(1))) 363 | 364 | if m := re.search(r'ytplayer.web_player_context_config = (\{.*?\});', html): 365 | #print("cfg", m.group(1)) 366 | result["playercg"] = json.loads(m.group(1)) 367 | # device.interfaceVersion: "2.20210111.08.00", 368 | # "innertubeApiKey": "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8", 369 | # "innertubeContextClientVersion": "2.20210111.08.00", 370 | 371 | if m := re.search(r'setMessage\((\{.*?\})\);', html): 372 | #print("msg", m.group(1)) 373 | result["msg"] = json.loads(m.group(1)) 374 | 375 | if m := re.search(r']*>var ytInitialPlayerResponse = (\{.*?\});', html): 376 | #print("initplayer", m.group(1)) 377 | result["initplayer"] = json.loads(m.group(1)) 378 | # note: this is the same as pbj.[].playerResponse 379 | 380 | if m := re.search(r']*>var ytInitialData = (\{.*?\});', html): 381 | #print("initdata", m.group(1)) 382 | result["initdata"] = json.loads(m.group(1)) 383 | # note: this is the same as pbj.[].response 384 | if m := re.search(r']*>window\["ytInitialData"\] = (\{.*?\});', html): 385 | #print("initdata", m.group(1)) 386 | result["initdata"] = json.loads(m.group(1)) 387 | # note: this is the same as pbj.[].response 388 | 389 | 390 | return result 391 | 392 | 393 | 394 | class LivechatReader: 395 | """ 396 | class reads a livechat or livechat replay. 397 | """ 398 | def __init__(self, args, yt, cfg, live=False): 399 | self.args = args 400 | self.yt = yt 401 | self.live = live 402 | self.cont = self.getchatinfo(cfg) 403 | 404 | def getcontinuation(self, p): 405 | p = getitem(p, "continuations", 0, "reloadContinuationData") 406 | if not p: 407 | return 408 | return p["continuation"] 409 | 410 | def getchatinfo(self, cfg): 411 | """ 412 | Find the base parameters for querying the video's comments. 413 | 414 | """ 415 | item = getitem(cfg, "initdata", "contents", "twoColumnWatchNextResults", "conversationBar", "liveChatRenderer") 416 | if not item: 417 | return 418 | 419 | return self.getcontinuation(item) 420 | 421 | def recursechat(self): 422 | if not self.cont: 423 | print("no live chat replay found") 424 | return 425 | ms = 0 426 | while True: 427 | #cmtjson = self.yt.getchat2(self.cont, ms, self.live) 428 | cmtjson = self.yt.getchat(self.cont, self.live) 429 | if self.args.debug: 430 | print("============ chat req") 431 | print(cmtjson.decode('utf-8')) 432 | print() 433 | if cmtjson.startswith(b"", time, author) 448 | print(extracttext(comment)) 449 | 450 | ms = newms 451 | 452 | print("========== live ===========") 453 | 454 | self.monitorchat(js["initdata"]) 455 | 456 | 457 | def extractchat(self, js): 458 | actions = getitem(js, "continuationContents", "liveChatContinuation", "actions") 459 | if not actions: 460 | return [], None 461 | 462 | cmtlist = [] 463 | ms = None 464 | 465 | def addchatitem(item): 466 | msg = getitem(item, "message") 467 | author = getitem(item, "authorName", "simpleText") 468 | time = getitem(item, "timestampText", "simpleText") 469 | if time is None: 470 | timeusec = getitem(item, "timestampUsec") 471 | if timeusec is not None: 472 | dt = datetime.datetime.fromtimestamp(int(timeusec)/1000000) 473 | time = dt.strftime("%Y-%m-%d %H:%M:%S") 474 | 475 | cmtlist.append((author, time, msg)) 476 | 477 | for act in actions: 478 | replayactions = getitem(act, "replayChatItemAction", "actions") 479 | ms = getitem(act, "replayChatItemAction", "videoOffsetTimeMsec") 480 | 481 | if replayactions: 482 | for ract in replayactions: 483 | item = getitem(ract, "addChatItemAction", "item", "liveChatTextMessageRenderer") 484 | if item: 485 | addchatitem(item) 486 | 487 | item = getitem(act, "addChatItemAction", "item", "liveChatTextMessageRenderer") 488 | if item: 489 | addchatitem(item) 490 | 491 | return cmtlist, ms 492 | 493 | def monitorchat(self, js): 494 | while True: 495 | cont = getitem(js, "continuationContents", "liveChatContinuation", "continuations", 0, "invalidationContinuationData", "continuation") 496 | respjson = self.yt.getlivechat(cont) 497 | if self.args.debug: 498 | print("============ comment req") 499 | print(respjson.decode('utf-8')) 500 | print() 501 | js = json.loads(respjson) 502 | 503 | cmtlist, newms = self.extractchat(js) 504 | 505 | for author, time, comment in cmtlist: 506 | print("--->", time, author) 507 | print(extracttext(comment)) 508 | sys.stdout.flush() 509 | 510 | import time 511 | time.sleep(1) 512 | 513 | 514 | 515 | class CommentReader: 516 | """ 517 | class which can recursively print comments 518 | """ 519 | def __init__(self, args, yt, cfg): 520 | self.args = args 521 | self.yt = yt 522 | self.contclick, self.xsrf = self.getcommentinfo(cfg) 523 | 524 | def recursecomments(self, cc=None, level=0): 525 | if not cc and not level: 526 | cc = self.contclick 527 | while cc: 528 | cmtjson = self.yt.getcomments(cc, self.xsrf, replies=(level>0)) 529 | if self.args.debug: 530 | print("============ comment req") 531 | print(cmtjson.decode('utf-8')) 532 | print() 533 | 534 | if not cmtjson: 535 | raise Exception("empty response") 536 | 537 | js = json.loads(cmtjson) 538 | 539 | if type(js)==list: 540 | # this is for 'replies', which return an array instead of a dict as the top-level response. 541 | js = getitem(js, ("response",)) 542 | 543 | cmtlist, cc = self.extractcomments(js) 544 | 545 | for author, when, comment, likes, replies, subcc in cmtlist: 546 | if self.args.verbose: 547 | print("---" * (level+1) + ">", "%s ; %s ; %s likes ; %s replies" % (author, when, likes, replies)) 548 | else: 549 | print("---" * (level+1) + ">", author) 550 | print(extracttext(comment)) 551 | if subcc: 552 | self.recursecomments(subcc, level+1) 553 | 554 | def getcommentinfo(self, cfg): 555 | """ 556 | Find the base parameters for querying the video's comments. 557 | 558 | """ 559 | item = getitem(cfg, "initdata", "contents", "twoColumnWatchNextResults", "results", "results", "contents") 560 | cont = getcontinuation(getitem(item, ("itemSectionRenderer",), "itemSectionRenderer")) 561 | xsrftoken = getitem(cfg, "ytcfg", "XSRF_TOKEN") 562 | xsrffield = getitem(cfg, "ytcfg", "XSRF_FIELD_NAME") 563 | 564 | xsrfdict = { xsrffield: xsrftoken } if xsrftoken else {} 565 | 566 | return cont, xsrfdict 567 | 568 | def getcomment(self, p): 569 | """ 570 | Return info for a single comment. 571 | """ 572 | if "commentThreadRenderer" in p: 573 | p = p["commentThreadRenderer"] 574 | 575 | c = p 576 | r = p 577 | if "comment" in c: 578 | c = c["comment"] 579 | if "commentRenderer" in c: 580 | c = c["commentRenderer"] 581 | if "replies" in r: 582 | r = r["replies"] 583 | 584 | author = getitem(c, "authorText", "simpleText") 585 | content = getitem(c, "contentText") 586 | likes = getitem(c, "likeCount") 587 | nrreplies = getitem(c, "replyCount") 588 | when = extracttext(getitem(c, "publishedTimeText")) 589 | replies = getitem(r, "commentRepliesRenderer") 590 | if replies: 591 | cont = getcontinuation(replies) 592 | else: 593 | cont = None 594 | 595 | return author, when, content, int(likes or 0), int(nrreplies or 0), cont 596 | 597 | def extractcomments(self, js): 598 | """ 599 | Extract a list of comments from comment dictionary 600 | """ 601 | endpoints = getitem(js, "onResponseReceivedEndpoints") 602 | if not endpoints: 603 | return [], None 604 | 605 | cc = None 606 | cmtlist = [] 607 | for p in endpoints: 608 | items = getitem(p, "reloadContinuationItemsCommand", "continuationItems") 609 | if not items: 610 | items = getitem(p, "appendContinuationItemsAction", "continuationItems") 611 | if not items: 612 | continue 613 | for p in items: 614 | c = getitem(p, "commentThreadRenderer") 615 | if c: 616 | cmtlist.append(self.getcomment(c)) 617 | c = getitem(p, "commentRenderer") 618 | if c: 619 | cmtlist.append(self.getcomment(c)) 620 | c = getitem(p, "continuationItemRenderer") 621 | if c: 622 | cc = c 623 | 624 | return cmtlist, cc 625 | 626 | 627 | class SearchReader: 628 | def __init__(self, args, yt, cfg): 629 | self.args = args 630 | self.yt = yt 631 | self.cfg = cfg 632 | 633 | def getresults(self, js): 634 | ct = getitem(js, "contents", "twoColumnSearchResultsRenderer", "primaryContents", "sectionListRenderer", "contents") 635 | if not ct: 636 | ct = getitem(js, "onResponseReceivedCommands", 0, "appendContinuationItemsAction", "continuationItems") 637 | 638 | resultlist = getitem(ct, ("itemSectionRenderer",), "itemSectionRenderer", "contents") 639 | cont = getitem(ct, ("continuationItemRenderer",), "continuationItemRenderer", "continuationEndpoint", "continuationCommand", "token") 640 | 641 | return resultlist, cont 642 | 643 | def recursesearch(self): 644 | 645 | resultlist, cont = self.getresults(getitem(self.cfg, "initdata")) 646 | while True: 647 | for item in resultlist: 648 | if video := item.get("videoRenderer"): 649 | vid = getitem(video, "videoId") 650 | pub = getitem(video, "publishedTimeText", "simpleText") 651 | title = getitem(video, "title") 652 | # title -> runs 653 | # descriptionSnippet -> runs 654 | # publishedTimeText -> simpleText 655 | # lengthText -> simpleText 656 | # viewCountText -> simpleText 657 | # ownerText -> runs 658 | print("%s - %s" % (vid, extracttext(title))) 659 | elif chan := item.get("channelRenderer"): 660 | cid = getitem(chan, "channelId") 661 | title = getitem(chan, "title", "simpleText") 662 | # "videoCountText" -> runs 663 | # subscriberCountText -> simpleText 664 | # descriptionSnippet -> runs 665 | print("%s - %s" % (cid, title)) 666 | 667 | jstext = self.yt.getsearch(cont) 668 | js = json.loads(jstext) 669 | resultlist, cont = self.getresults(js) 670 | 671 | 672 | class DetailReader: 673 | """ 674 | Extract some details for a video from the config. 675 | """ 676 | def __init__(self, args, yt, cfg): 677 | self.args = args 678 | self.yt = yt 679 | self.cfg = cfg 680 | 681 | def output(self): 682 | vd = getitem(self.cfg, "initplayer", "videoDetails") 683 | mf = getitem(self.cfg, "initplayer", "microformat", "playerMicroformatRenderer") 684 | twocol = getitem(self.cfg, "initdata", "contents", "twoColumnWatchNextResults", "results", "results", "contents") 685 | sentiment = getitem(twocol, ("videoPrimaryInfoRenderer",), "videoPrimaryInfoRenderer", "sentimentBar", "sentimentBarRenderer", "tooltip") 686 | 687 | if not mf: 688 | print("microformat not found") 689 | return 690 | 691 | vc = int(mf.get("viewCount")) 692 | ls = cvseconds(mf.get("lengthSeconds")) 693 | pd = cvdate(mf.get("publishDate")) 694 | ud = cvdate(mf.get("uploadDate")) 695 | desc = getitem(mf, "description", "simpleText") 696 | 697 | vid = vd.get("videoId") 698 | 699 | title = getitem(mf, "title", "simpleText") 700 | owner = getitem(mf, "ownerChannelName") 701 | 702 | print("%s - %s" % (vid, title)) 703 | print("By: %s" % (owner)) 704 | print() 705 | print("viewcount: %d, length: %s, sentiment: %s, published: %s%s" % (vc, ls, sentiment, pd, "" if pd==ud else ", uploaded at: %s" % ud)) 706 | print() 707 | print("%s" % desc) 708 | print() 709 | 710 | 711 | class SubtitleReader: 712 | """ 713 | class which can print a video's subtitles 714 | """ 715 | def __init__(self, args, yt, cfg): 716 | self.args = args 717 | self.yt = yt 718 | self.cfg = cfg 719 | 720 | def languagematches(self, language, ct): 721 | """ 722 | Match a captionTrack record to the language filter. 723 | """ 724 | if language == 'asr' and ct.get('kind') == 'asr': 725 | return True 726 | if ct["name"]["simpleText"] == language: 727 | return True 728 | if ct["languageCode"] == language: 729 | return True 730 | 731 | def output(self): 732 | js = getitem(self.cfg, "initplayer") 733 | p = getitem(js, "captions", "playerCaptionsTracklistRenderer", "captionTracks") 734 | 735 | if not p: 736 | print("no subtitles found") 737 | return 738 | 739 | captiontracks = p 740 | 741 | # filter subtitles based on language 742 | if self.args.language: 743 | captiontracks = self.filtertracks(self.args.language, captiontracks) 744 | 745 | for ct in captiontracks: 746 | if len(captiontracks) > 1: 747 | print("### %s ###" % ct["name"]["simpleText"]) 748 | 749 | self.outputsubtitles(ct["baseUrl"]) 750 | 751 | if len(captiontracks) > 1: 752 | print() 753 | 754 | def filtertracks(self, language, captiontracks): 755 | matchedtracks = defaultdict(list) 756 | for ct in captiontracks: 757 | if not self.languagematches(language, ct): 758 | continue 759 | 760 | matchedtracks[ct["languageCode"]].append(ct) 761 | 762 | filteredlist = [] 763 | for lang, tracks in matchedtracks.items(): 764 | if len(tracks) > 1: 765 | # prefer non automated translation 766 | tracks = filter(lambda ct:ct.get("kind") != "asr", tracks) 767 | filteredlist.extend(tracks) 768 | 769 | return filteredlist 770 | 771 | def outputsubtitles(self, cturl): 772 | ttxml = self.yt.httpreq(cturl) 773 | if self.args.debug: 774 | print("========== timedtext xml") 775 | print(ttxml.decode('utf-8')) 776 | print() 777 | tt = self.extractxmltext(ttxml) 778 | 779 | if self.args.srt: 780 | self.output_srt(tt) 781 | elif self.args.verbose: 782 | for t0, t1, txt in tt: 783 | print("%s %s" % (self.formattime(t0), txt)) 784 | else: 785 | for t0, t1, txt in tt: 786 | print(txt) 787 | 788 | @staticmethod 789 | def formattime(t): 790 | m = int(t/60) ; t -= 60*m 791 | h = int(m/60) ; m -= 60*h 792 | return "%d:%02d:%06.3f" % (h, m, t) 793 | 794 | @staticmethod 795 | def srttime(t): 796 | return SubtitleReader.formattime(t).replace('.', ',') 797 | 798 | @staticmethod 799 | def output_srt(tt): 800 | n = 1 801 | for t0, t1, txt in tt: 802 | print(n) 803 | print("%s --> %s" % (SubtitleReader.srttime(t0), SubtitleReader.srttime(t1))) 804 | print(txt) 805 | print() 806 | 807 | @staticmethod 808 | def unhtml(htmltext): 809 | """ 810 | Removes html font tags, and decodes html entities 811 | """ 812 | return html.unescape(re.sub(r']*>', '', htmltext)) 813 | 814 | def extractxmltext(self, xml): 815 | """ 816 | Returns a list of tuples: time, endtime, text 817 | """ 818 | lines = [] 819 | tstart = None 820 | tend = None 821 | text = None 822 | def handle_begin_element(elem, attr): 823 | nonlocal text, tstart, tend 824 | if elem == 'text': 825 | text = "" 826 | tstart = float(attr.get('start')) 827 | tend = tstart + float(attr.get('dur')) 828 | 829 | def handle_end_element(elem): 830 | nonlocal text 831 | if elem == 'text': 832 | lines.append((tstart, tend, self.unhtml(text))) 833 | text = None 834 | def handle_data(data): 835 | nonlocal text 836 | if text is not None: 837 | text += data 838 | 839 | parser = ParserCreate() 840 | parser.StartElementHandler = handle_begin_element 841 | parser.EndElementHandler = handle_end_element 842 | parser.CharacterDataHandler = handle_data 843 | parser.Parse(xml, 1) 844 | 845 | return lines 846 | 847 | 848 | class PlaylistReader: 849 | """ 850 | class which can print a playlist's contents. 851 | """ 852 | def __init__(self, args, yt, cfg): 853 | self.args = args 854 | self.yt = yt 855 | self.cfg = cfg 856 | 857 | def output(self): 858 | # ==== [ 'playlistVideoRenderer', 1, 'contents', 'playlistVideoListRenderer', 0, 'contents', 'itemSectionRenderer', 0, 'contents', 'sectionListRenderer', 'content', 'tabRenderer', 0, 'tabs', 'twoColumnBrowseResultsRenderer', 'contents', 'response', 1] 859 | # ==== ['gridVideoRenderer', 1, 'items', 'horizontalListRenderer', 'content', 'shelfRenderer', 0, 860 | # 'contents', 'itemSectionRenderer', 1, 'contents', 'sectionListRenderer', 'content', 'tabRenderer', 0, 861 | # 'tabs', 'twoColumnBrowseResultsRenderer', 'contents', 'response', 1] 862 | playlist = getitem(self.cfg, "initdata", "contents", "twoColumnWatchNextResults", "playlist") 863 | if playlist: 864 | print("Title: %s" % getitem(playlist, "playlist", "title")) 865 | for entry in getitem(playlist, "playlist", "contents"): 866 | vid = getitem(entry, "playlistPanelVideoRenderer", "videoId") 867 | title = getitem(entry, "playlistPanelVideoRenderer", "title", "simpleText") 868 | length = getitem(entry, "playlistPanelVideoRenderer", "lengthText", "simpleText") 869 | if args.verbose: 870 | print("%s - %s %s" % (vid, length, title)) 871 | else: 872 | print("%s - %s" % (vid, title)) 873 | return 874 | tabs = getitem(self.cfg, "initdata", "contents", "twoColumnBrowseResultsRenderer", "tabs", 0, "tabRenderer", "content") 875 | ct1 = getitem(tabs, "sectionListRenderer", "contents", 0, "itemSectionRenderer", "contents", 0) 876 | playlist = getitem(ct1, "playlistVideoListRenderer") 877 | list_tag = "contents" 878 | entry_tag = "playlistVideoRenderer" 879 | if not playlist: 880 | playlist = getitem(ctl, "shelfRenderer", "content", 'horizontalListRenderer') 881 | list_tag = "items" 882 | entry_tag = "gridVideoRenderer" 883 | if playlist: 884 | cont = None 885 | for entry in playlist[list_tag]: 886 | vid = getitem(entry, entry_tag, "videoId") 887 | title = getitem(entry, entry_tag, "title") 888 | if vid and title: 889 | print("%s - %s" % (vid, extracttext(title))) 890 | c = getitem(entry, "continuationItemRenderer") 891 | if c: 892 | cont = c 893 | 894 | if not cont: 895 | cont = getcontinuation(playlist) 896 | while cont: 897 | browsejson = self.yt.browse(cont) 898 | if self.args.debug: 899 | print("============ browse req") 900 | print(browsejson.decode('utf-8')) 901 | print() 902 | 903 | js = json.loads(browsejson) 904 | 905 | cont = None 906 | playlist = getitem(js, "initdata", "continuationContents", "gridContinuation") 907 | if playlist: 908 | for entry in getitem(playlist, "items"): 909 | vid = getitem(entry, "gridVideoRenderer", "videoId") 910 | title = getitem(entry, "gridVideoRenderer", "title") 911 | print("%s - %s" % (vid, extracttext(title))) 912 | playlist = getitem(js, "initdata", "continuationContents", "playlistVideoListContinuation") 913 | item_tag = "contents" 914 | if not playlist: 915 | playlist = getitem(js, "initdata", "onResponseReceivedActions", 0, "appendContinuationItemsAction") 916 | item_tag = "continuationItems" 917 | if not playlist: 918 | playlist = getitem(js, "onResponseReceivedActions", 0, "appendContinuationItemsAction") 919 | item_tag = "continuationItems" 920 | if playlist: 921 | for entry in getitem(playlist, item_tag): 922 | vid = getitem(entry, "playlistVideoRenderer", "videoId") 923 | title = getitem(entry, "playlistVideoRenderer", "title") 924 | if vid and title: 925 | print("%s - %s" % (vid, extracttext(title))) 926 | c = getitem(entry, "continuationItemRenderer") 927 | if c: 928 | cont = c 929 | 930 | if not playlist: 931 | break 932 | if not cont: 933 | cont = getcontinuation(playlist) 934 | 935 | return 936 | 937 | 938 | def parse_youtube_link(url): 939 | """ 940 | Recognize different types of youtube urls: 941 | 942 | http://, https:// 943 | 944 | youtu.be/[?list=] 945 | 946 | (?:www.)?youtube.com... 947 | 948 | /channel/ 949 | /c/ 950 | /playlist?list= 951 | /watch?v= [&t=pos] [&list=] 952 | /watch/ 953 | /v/ 954 | /embed/ 955 | /user/ 956 | /watch_videos?video_ids=,,... 957 | /results?search_query=... 958 | """ 959 | m = re.match(r'^(?:https?://)?(?:www\.)?(?:(?:youtu\.be|youtube\.com)/)?(.*)', url) 960 | if not m: 961 | raise Exception("youtube link not matched") 962 | 963 | path = m.group(1) 964 | 965 | if m := re.match(r'^user/([^/?]+)', path): 966 | yield 'username', m.group(1) 967 | elif m := re.match(r'^(\w+)/([A-Za-z0-9_-]+)(.*)', path): 968 | idtype = m.group(1) 969 | if idtype in ('v', 'embed', 'watch'): 970 | idtype = 'video' 971 | elif idtype in ('channel'): 972 | idtype = 'channel' 973 | elif idtype in ('c'): 974 | idtype = 'channelname' 975 | elif idtype in ('playlist'): 976 | idtype = 'playlist' 977 | else: 978 | raise Exception("unknown id type") 979 | 980 | idvalue = m.group(2) 981 | yield idtype, idvalue 982 | if idtype == 'channel': 983 | yield 'playlist', 'UU' + idvalue[2:] 984 | 985 | idargs = urllib.parse.parse_qs(m.group(3)) 986 | if idvalue := idargs.get('v'): 987 | if idvalue[0]: 988 | yield 'video', idvalue[0] 989 | if idvalue := idargs.get('list'): 990 | if idvalue[0]: 991 | yield 'playlist', idvalue[0] 992 | 993 | elif m := re.match(r'^(v|embed|watch|channel|playlist)(?:\?(.*))?$', path): 994 | idtype = m.group(1) 995 | if idtype in ('v', 'embed', 'watch'): 996 | idtype = 'video' 997 | elif idtype in ('channel'): 998 | idtype = 'channel' 999 | elif idtype in ('playlist'): 1000 | idtype = 'playlist' 1001 | 1002 | idargs = urllib.parse.parse_qs(m.group(2)) 1003 | if idvalue := idargs.get('v'): 1004 | if idvalue[0]: 1005 | yield 'video', idvalue[0] 1006 | if idvalue := idargs.get('list'): 1007 | if idvalue[0]: 1008 | yield 'playlist', idvalue[0] 1009 | 1010 | elif m := re.match(r'^results\?(.*)$', path): 1011 | idargs = urllib.parse.parse_qs(m.group(1)) 1012 | if idvalue := idargs.get('search_query'): 1013 | if idvalue[0]: 1014 | yield 'search', idvalue[0] 1015 | 1016 | elif m := re.match(r'^[A-Za-z0-9_-]+$', path): 1017 | if len(path)==11: 1018 | yield 'video', path 1019 | else: 1020 | yield 'playlist', path 1021 | 1022 | else: 1023 | raise Exception("unknown id") 1024 | 1025 | def channelurl_from_userpage(cfg): 1026 | return getitem(cfg, "initdata", "metadata", "channelMetadataRenderer", "channelUrl") 1027 | # or "initplayer", "microformat", "playerMicroformatRenderer", "externalChannelId" 1028 | # or "initplayer", "videoDetails", "channelId" 1029 | 1030 | def check_error(cfg): 1031 | status = getitem(cfg, "initplayer", "playabilityStatus") 1032 | if not status: 1033 | return 1034 | if status["status"] == "ERROR": 1035 | print(status["reason"]) 1036 | return True 1037 | 1038 | def main(): 1039 | import io 1040 | sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8') 1041 | 1042 | import argparse 1043 | parser = argparse.ArgumentParser(description='Extract Youtube comments') 1044 | parser.add_argument('--debug', '-d', action='store_true', help='print all intermediate steps') 1045 | parser.add_argument('--verbose', '-v', action='store_true', help='prefix each line with the timestamp') 1046 | parser.add_argument('--comments', '-c', action='store_true', help='Print video comments') 1047 | parser.add_argument('--subtitles', '-t', action='store_true', help='Print video subtitles') 1048 | parser.add_argument('--language', type=str, help='Output only subtitles in the specified language') 1049 | parser.add_argument('--playlist', '-l', action='store_true', help='Print playlist items') 1050 | parser.add_argument('--info', '-i', action='store_true', help='Print video info') 1051 | parser.add_argument('--srt', action='store_true', help='Output subtitles in .srt format.') 1052 | parser.add_argument('--query', '-q', action='store_true', help='List videos matching the specified query') 1053 | parser.add_argument('--livechat', action='store_true', help='Follow livechat contents') 1054 | parser.add_argument('--replay', action='store_true', help='Print livechat replay') 1055 | parser.add_argument('--proxy', type=str, help='Specify a proxy to use.') 1056 | parser.add_argument('ytids', nargs='+', type=str, help='One or more Youtube URLs, or IDs, or a query') 1057 | args = parser.parse_args() 1058 | 1059 | if args.proxy and args.proxy.startswith('socks'): 1060 | load_socks_proxy(args.proxy) 1061 | 1062 | yt = Youtube(args) 1063 | 1064 | for url in args.ytids: 1065 | if len(args.ytids) > 1: 1066 | print("==>", url, "<==") 1067 | if args.query: 1068 | # note: the 'url' variable holds the query. 1069 | # convert it to a query url so the parse link function can decode it. 1070 | url = "https://www.youtube.com/results?" + urllib.parse.urlencode({"search_query": url}) 1071 | 1072 | # analyze url for id's, like videoid, channelid, playlistid or search query. 1073 | for idtype, idvalue in parse_youtube_link(url): 1074 | # reformat the url in a way that i am sure returns the right json data. 1075 | 1076 | if idtype == 'video': 1077 | url = "https://www.youtube.com/watch?v=%s" % idvalue 1078 | elif idtype == 'playlist': 1079 | url = "https://www.youtube.com/playlist?list=%s" % idvalue 1080 | elif idtype == 'channel': 1081 | url = "https://www.youtube.com/channel/%s" % idvalue 1082 | elif idtype == 'username': 1083 | url = "https://www.youtube.com/user/%s" % idvalue 1084 | elif idtype == 'search': 1085 | url = "https://www.youtube.com/results?" + urllib.parse.urlencode({"search_query": idvalue}) 1086 | 1087 | #cfg = yt.getpageinfo(url) 1088 | #if check_error(cfg): 1089 | # continue 1090 | html = yt.httpreq(url) 1091 | if args.debug: 1092 | print("============ youtube html") 1093 | print(html.decode('utf-8')) 1094 | print() 1095 | cfg = filterhtml(html.decode('utf-8')) 1096 | if args.debug: 1097 | print("============ youtube extracted config") 1098 | print(json.dumps(cfg)) 1099 | print() 1100 | 1101 | if idtype=='username': 1102 | url = channelurl_from_userpage(cfg) 1103 | args.ytids.append(url) 1104 | # note: the new url is processed in next loop iteration. 1105 | 1106 | if args.comments and idtype=='video': 1107 | cmt = CommentReader(args, yt, cfg) 1108 | cmt.recursecomments() 1109 | if args.subtitles and idtype=='video': 1110 | txt = SubtitleReader(args, yt, cfg) 1111 | txt.output() 1112 | if (args.replay or args.livechat) and idtype=='video': 1113 | txt = LivechatReader(args, yt, cfg, live=args.livechat) 1114 | txt.recursechat() 1115 | if args.playlist and idtype=='playlist': 1116 | lst = PlaylistReader(args, yt, cfg) 1117 | lst.output() 1118 | if (args.playlist or args.query) and idtype == 'search': 1119 | q = SearchReader(args, yt, cfg) 1120 | q.recursesearch() 1121 | if args.info and idtype=='video': 1122 | lst = DetailReader(args, yt, cfg) 1123 | lst.output() 1124 | 1125 | 1126 | if __name__ == '__main__': 1127 | main() 1128 | 1129 | 1130 | --------------------------------------------------------------------------------