├── Changelog.md
├── setup.py
├── LICENSE
├── ytdump.py
├── README.md
└── yttool.py


/Changelog.md:
--------------------------------------------------------------------------------
 1 | 1.0.11 - 2021-05-14
 2 |  * fixed commentlist problem with some videos.
 3 | 
 4 | 1.0.10 - 2021-04-25
 5 |  * --livechat now monitors the livechat
 6 | 
 7 | 1.0.9  - 2021-04-25
 8 |  * all options work again, after changes to the youtube api.
 9 | 
10 | 1.0.8  - 2021-03-29
11 |  * made comments work again.
12 | 
13 | 1.0.7  - 2021-02-16
14 |  * added optional socks / tor proxy
15 |  * output text as utf-8.
16 | 
17 | 1.0.6  - 2021-01-13
18 |  * repaired 'chat replay'
19 |  * repaired -l option
20 | 
21 | 1.0.5  - 2020-08-17
22 |  * fixed issue with 'too large request' error, for videos with lots of comments.
23 | 
24 | 1.0.4  - 2020-06-18   initial release
25 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | setup(
 3 |     name = "youtube_tool",
 4 |     version = "1.0.11",
 5 |     entry_points = {
 6 |         'console_scripts': ['yttool=yttool:main'],
 7 |     },
 8 |     py_modules=['yttool'],
 9 |     author = "Willem Hengeveld",
10 |     author_email = "itsme@xs4all.nl",
11 |     description = "Extract information from youtube video's",
12 |     long_description="""
13 | Commandline tool which can extract comments, subtitles or livechat
14 | content from a youtube video. It can also list all video's
15 | in a playlist, or from a search result.
16 | """,
17 | 
18 |     license = "MIT",
19 |     keywords = "youtube commandline",
20 |     url = "https://github.com/nlitsme/youtube_tool/",
21 |     classifiers = [
22 |         'Environment :: Console',
23 |         'Intended Audience :: End Users/Desktop',
24 |         'Intended Audience :: Developers',
25 |         'License :: OSI Approved :: MIT License',
26 |         'Operating System :: OS Independent',
27 |         'Programming Language :: Python :: 3',
28 |         'Topic :: Utilities',
29 |     ],
30 |     python_requires = '>=3.8',
31 | )
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Willem Hengeveld <itsme@xs4all.nl>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/ytdump.py:
--------------------------------------------------------------------------------
  1 | """
  2 | A tool for investigating youtube json dictionaries.
  3 | 
  4 | This tries to pretty print the rather complex json dictionaries youtube uses.
  5 | You can pass the json either through stdin, pass it as a string on the commandline,
  6 | or as a filename on the commandline.
  7 | 
  8 | 
  9 | Author: Willem Hengeveld <itsme@xs4all.nl>
 10 | """
 11 | import json
 12 | import sys
 13 | import os.path
 14 | 
 15 | 
 16 | def extractruns(runs):
 17 |     """
 18 |     Extract all text in a 'runs' dictionary.
 19 |     """
 20 |     text = []
 21 |     for r in runs:
 22 |         text.append(r.get('text'))
 23 |     return "".join(text)
 24 | 
 25 | 
 26 | def pathendswith(path, *end):
 27 |     """
 28 |     A helper for matching paths in the json dictionary.
 29 |     """
 30 |     if len(end) > len(path):
 31 |         return False
 32 |     for a, b in zip(path[-len(end):], end):
 33 |         if type(b)==type:
 34 |             if type(a)!=b:
 35 |                 return False
 36 |         elif type(b)==int:
 37 |             if a != b:
 38 |                 return False
 39 |         elif type(a)==int:
 40 |             return False
 41 |         elif b[:1] == '*':
 42 |             if not a.endswith(b[1:]):
 43 |                 return False
 44 |         else:
 45 |             if a != b:
 46 |                 return False
 47 |     return True
 48 | 
 49 | 
 50 | def processRender(j, path):
 51 |     """
 52 |     print all properties directly under 'j'
 53 |     """
 54 |     info = []
 55 |     for k, item in j.items():
 56 |         if type(item) in (int, float, str, bool):
 57 |             info.append((k, item))
 58 |         elif type(item) != dict:
 59 |             pass
 60 |         elif runs := item.get('runs'):
 61 |             info.append((k, extractruns(runs)))
 62 |         elif text := item.get("simpleText"):
 63 |             info.append((k, text))
 64 |     indent = "  " * len(path)
 65 |     print(indent, "==== %s" % (path[::-1],))
 66 |     for k, v in info:
 67 |         print(indent, "|    %-20s : %s" % (k, v))
 68 | 
 69 | 
 70 | def process(j, path=[]):
 71 |     """
 72 |     recursively process the json dictionary passed in 'j'.
 73 | 
 74 |     Printing all 'Renderer' dictionaries in detail, indented according to path length.
 75 | 
 76 |     The path is the list of keys needed to find the current entry from the top.
 77 |     """
 78 |     if path:
 79 |         if pathendswith(path, "*Renderer"):
 80 |             if type(j)!=dict:
 81 |                 print("WARNING: Renderer without dict", path)
 82 |             else:
 83 |                 processRender(j, path)
 84 |         elif pathendswith(path, "continuations"):
 85 |             if not pathendswith(path, "*Renderer", "continuations"):
 86 |                 print("WARNING: continuations without renderer", path)
 87 |             pass
 88 |         elif pathendswith(path, "nextContinuationData"):
 89 |             if not pathendswith(path, "continuations", int, "nextContinuationData"):
 90 |                 print("WARNING: nextContinuationData without continuations", path)
 91 |             pass
 92 |         elif pathendswith(path, "continuation"):
 93 |             if not pathendswith(path, "nextContinuationData", "continuation"):
 94 |                 print("WARNING: continuation without nextContinuationData", path)
 95 |             pass
 96 | 
 97 |     if type(j) == list:
 98 |         for i, item in enumerate(j):
 99 |             process(item, path + [i])
100 |     elif type(j) == dict:
101 |         for k, item in j.items():
102 |             process(item, path + [k])
103 |     elif type(j) in (int, float, str, bool, type(None)):
104 |         pass
105 |     else:
106 |         print("WARNING: unexpected type", type(j), j)
107 | 
108 | 
def main():
    """
    Commandline entry point.

    Without arguments, the json is read from stdin.
    Otherwise each argument is either the name of an existing file containing
    json, or a literal json string. Each json object is passed to process().

    Errors while reading or processing a file are reported and do not stop
    the processing of the remaining arguments.
    """
    if len(sys.argv) == 1:
        process(json.loads(sys.stdin.read()))
        return

    for arg in sys.argv[1:]:
        if os.path.exists(arg):
            try:
                # Open as binary: json.load then detects the utf-8/16/32
                # encoding itself, instead of depending on the locale's
                # default text encoding.
                with open(arg, "rb") as fh:
                    print("==>", arg, "<==")
                    process(json.load(fh))
            except Exception as e:
                print("ERROR reading %s: %s" % (arg, e))
        else:
            print("==> json commandline argument <==")
            process(json.loads(arg))


if __name__ == '__main__':
    main()
131 | 
132 | 
133 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # yttool
  2 | 
  3 | A tool for extracting info from youtube:
  4 |  * print all comments for a video
  5 |  * print a video's description + info
  6 |  * print all subtitles for a video
  7 |  * print out an entire livechat replay.
  8 |  * list all items in a playlist
  9 |  * list all videos for a channel or user
 10 |  * list all video's matching a query
 11 | 
 12 | # install
 13 | 
 14 | You can install this from the official python repository using `pip`:
 15 | 
 16 |     pip3 install youtube-tool
 17 | 
 18 | This will add a command `yttool` to your python binaries directory,
 19 | and probably also to your search path. So you can run this like:
 20 | 
 21 |     yttool ....arguments....
 22 | 
 23 | Note: depending on your local python installation(s), you may have to type
 24 | one of `pip`, `pip3`, or maybe even: `pip3.8`.
 25 | 
 26 | 
 27 | You can also 'install' this by executing the `yttool.py` file directly from
 28 | the source directory:
 29 | 
 30 |     python3 yttool.py  ....arguments...
 31 | 
 32 | 
 33 | # requirements
 34 | 
 35 | This script needs python 3.8 or later to run.
 36 | The python3.8 specific feature I am using is the new `:=` walrus operator.
 37 | 
 38 | 
 39 | # usage
 40 | 
 41 | ## list all subtitles attached to a video.
 42 | 
 43 | This will output the subtitles in all available languages.
 44 | 
 45 |     yttool --subtitles https://www.youtube.com/watch?v=bJOuzqu3MUQ
 46 | 
 47 | Or list the subtitles prefixed with timestamps
 48 | 
 49 |     yttool -v --subtitles https://www.youtube.com/watch?v=bJOuzqu3MUQ
 50 | 
 51 | 
 52 | You can also extract the subtitles in a format suitable for
 53 | creating `.srt` subtitle files:
 54 | 
 55 |     yttool --srt --subtitles https://www.youtube.com/watch?v=bJOuzqu3MUQ
 56 | 
 57 | 
 58 | Or you can filter by language, for example only output the english subtitles:
 59 | 
 60 |     yttool --language en --subtitles https://www.youtube.com/watch?v=0xY06PT5JDE
 61 | 
 62 | Or only output the automatically generated subtitles:
 63 | 
 64 |     yttool --language asr --subtitles https://www.youtube.com/watch?v=0xY06PT5JDE
 65 | 
 66 | 
 67 | ## comments
 68 | 
 69 | List all the comments for this Numberphile video:
 70 | 
 71 |     yttool --comments https://www.youtube.com/watch?v=bJOuzqu3MUQ
 72 | 
 73 | 
 74 | ## livechat replay
 75 | 
 76 | Print out an entire livechat replay:
 77 | 
 78 |     yttool --replay https://www.youtube.com/watch?v=lE0u_jIDh0E
 79 | 
 80 | ## follow an active livechat
 81 | 
 82 | Note: this does not yet work!
 83 | 
 84 | Print messages from a livechat as they come:
 85 | 
 86 |     yttool --livechat https://www.youtube.com/watch?v=EEIk7gwjgIM
 87 | 
 88 | 
 89 | ## list a playlist contents.
 90 | 
 91 | List all the video's contained in this System of a Down playlist:
 92 | 
 93 |     yttool --playlist https://www.youtube.com/playlist?list=PLSKnqXUHTaSdXuK8Z2d-hXLFtJbRZwPtJ
 94 | 
 95 | The output will look like this:
 96 | 
 97 |     CSvFpBOe8eY - System Of A Down - Chop Suey! (Official Video)
 98 |     zUzd9KyIDrM - System Of A Down - B.Y.O.B. (Official Video)
 99 |     L-iepu3EtyE - System Of A Down - Aerials (Official Video)
100 |     iywaBOMvYLI - System Of A Down - Toxicity (Official Video)
101 |     DnGdoEa1tPg - System Of A Down - Lonely Day (Official Video)
102 |     LoheCz4t2xc - System Of A Down - Hypnotize (Official Video)
103 |     5vBGOrI6yBk - System Of A Down - Sugar (Official Video)
104 |     SqZNMvIEHhs - System Of A Down - Spiders (Official Video)
105 |     ENBv2i88g6Y - System Of A Down - Question! (Official Video)
106 |     bE2r7r7VVic - System Of A Down - Boom! (Official Video)
107 |     F46r-_jPPHY - System Of A Down - War? (Official Video)
108 | 
109 | The first 11 characters are the video id, you can load the corresponding video
110 | by typing: `https://www.youtube.com/watch?v=5vBGOrI6yBk` in your browser's URL bar.
111 | 
112 | 
113 | Or list all video's from a channel:
114 | 
115 |     yttool -l https://www.youtube.com/channel/UCoxcjq-8xIDTYp3uz647V5A
116 | 
117 | Or when you don't know the channelid, you can get the same with the username:
118 | 
119 |     yttool -l https://www.youtube.com/user/numberphile
120 | 
121 | 
122 | ## list query results
123 | 
124 | This:
125 | 
126 |     yttool -q somequery
127 | 
128 | Will list the first couple of videos matching that query.
129 | 
130 | ## Just the id's
131 | 
132 | You can also call yttool with only the video id as an argument:
133 | 
134 |     yttool --info CSvFpBOe8eY
135 | 
136 | 
137 | # How to use with a proxy?
138 | 
139 | For example if you would like to use TOR, you would do this:
140 | 
141 |     yttool --proxy socks5://localhost:9050 --info https://www.youtube.com/watch?v=Ll-_LV9U1tA
142 | 
143 | Note that setting a socks proxy via the `https_proxy` environment variable does NOT work very well with python's urllib library.
144 | 
145 | 
146 | # How does it work?
147 | 
148 | This script does not use the official youtube API, instead, it uses youtube's internal api, which is
149 | what is used on the youtube website itself. This does mean there is no guarantee that this script
150 | will keep working without maintenance. Youtube will keep changing the way it works internally.
151 | So I will need to keep updating this script.
152 | 
153 | The advantage of using the internal API, is that there are apparently no limits to how many requests you
154 | can do. And you don't have to bother with any kind of registration.
155 | 
156 | 
157 | These are the main internal api urls I am using:
158 | 
159 |  - comments: `https://www.youtube.com/comment_service_ajax`
160 |  - livechat: `https://www.youtube.com/live_chat_replay/get_live_chat_replay`
161 |  - search: `https://www.youtube.com/youtubei/v1/search`
162 |  - playlists: `https://www.youtube.com/browse_ajax`
163 | 
164 | Also, you can get youtube to respond with json instead of html by adding a `&pbj=1` argument to most urls,
165 | and add http headers: `x-youtube-client-name: 1` and `x-youtube-client-version: 2.20200603.01.00` to your request.
166 | Also the user-agent header needs to be of the right format, see my script for a working example.
167 | 
168 | Then, for search you need to add a `innertubeapikey`. Which I have currently hardcoded in my script, as i did with the client-version.
169 | A future improvement would be to automatically extract these from the current youtube front page.
170 | 
171 | 
172 | # Note about the structure of youtube video id's
173 | 
174 | Youtube's id's are structured in several ways:
175 | 
176 | A videoid is 11 characters long, when decoded using base64, this results in exactly 8 bytes.
177 | The last character of a videoid can only be: `048AEIMQUYcgkosw`  --> 10x6+4 = 64 bits
178 | 
179 | A playlist id is either 24 or 34 characters long, and has the following format:
180 | 
181 | ### id's containing a 'playlist' id.
182 | 
183 |  * "PL<playlistid>" or "EC<playlistid>" -- custom playlist, or educational playlist.
184 |  * "BP<playlistid>" and "SP<playlistid>"  also seem to have some kind of function.
185 |  * playlistid can be:
186 |    * either 32 base64 characters --> 32x6 = 192 bits
187 |    * or 16 hex characters --> 16x4 = 64 bits
188 |  * www.youtube.com/playlist?list=PL<playlistid>
189 |  * www.youtube.com/course?list=EC<playlistid>
190 |    * no longer works very well, the layout of the `course` page is broken,
191 |      with lots of overlapping text.
192 | 
193 | ### id's containing a channel id
194 | 
195 | A channel-id is 22 base64 characters, with the last character one of: `AQgw`, so this decodes to 21x6+2 = 128 bits
196 | 
197 |  * "UC<channelid>"  -- user channel
198 |    * www.youtube.com/channel/UC<channelid>
199 |  * "PU<channelid>"  -- popular uploads playlist
200 |    * quick way to load: www.youtube.com/watch?v=xxxxxxxxxxx&list=PU<channelid>
201 |  * "UU<channelid>"  -- user uploads playlist
202 |    * quick way to load: www.youtube.com/watch?v=xxxxxxxxxxx&list=UU<channelid>
203 |  * "LL<channelid>"  -- liked video's for user
204 |    * quick way to load: www.youtube.com/watch?v=xxxxxxxxxxx&list=LL<channelid>
205 |    * or www.youtube.com/playlist?list=LL<channelid>
206 |  * "FL<channelid>"     -- favorites
207 |    * www.youtube.com/watch?v=xxxxxxxxxxx&list=FL<channelid>
208 |  * "RDCMUC<channelid>" -- mix for channel
209 |    * www.youtube.com/watch?v=xxxxxxxxxxx&list=RDCMUC<channelid>
210 | 
211 |  * prefixes CL, EL, MQ, TT, WL also seem to have a special meaning
212 | 
213 | ### Other playlist types
214 | 
215 | These take 
216 |  * "TLGG<22chars>"  -- temporary list - redir from `watch_videos`
217 |     * When decoded, the last 8 bytes are digits for the "ddmmyyyy" date.
218 |  * "RDEM<22chars>" -- radio channel
219 |    * 22chars is NOT a channel-id
220 |    * www.youtube.com/watch?v=xxxxxxxxxxx&list=RDEM<22chars>
221 |  * "RD<videoid>"  -- mix for a specific video.
222 |  * "OLAK5uy_<33chars>"   -- album playlist.
223 |    * id's start with: `klmn`  : 0b1001xx
224 |    * id's end with: `AEIMQUYcgkosw048`  --> 2 + 31x6 + 4 = 192 bits
225 |    * www.youtube.com/playlist?list=OLAK5uy_<33chars>
226 |  * "WL"           -- 'watch later'
227 |    * www.youtube.com/playlist?list=WL
228 |    * www.youtube.com/watch?v=xxxxxxxxxxx&list=WL
229 |  * "UL"        -- channel video mix
230 |    * www.youtube.com/watch?v=<11charsvidid>&list=ULxxxxxxxxxxx
231 |    * This works only when there are exactly 11 characters after 'UL'
232 |  * "LM"        -- music.youtube likes
233 |  * "RDMM"      -- music.youtube your mix
234 |  * "RDAMVM<videoid>"      -- music.youtube band mix
235 |  * "RDAO<22chars>"
236 |  * "RDAMPL" + prefix+playlistid
237 |  * "RDCLAK5uy_" + 33chars
238 |  * "RDTMAK5uy_" + 33chars
239 | 
240 |  * prefixes EL, CL also seem to have a special meaning.
241 | 
242 | 
243 | ### post id's
244 | 
245 |  * 26 characters: Ug<17chars>4AaABCQ
246 |    * id's start with [wxyz]  : 0b1100xx
247 |    * id's end with [BFJNRVZdhlptx159]  : 0bxxxx01
248 |      -> 2 + 15*6 + 4  = 96 bits
249 | 
250 | # Youtube url's
251 | 
252 | Domains:
253 | 
254 |     youtu.be
255 |     youtube.com
256 | 
257 | UrlPath:
258 | 
259 |     /watch?v=<videoid>&t=123s&list=<listid>
260 |     /v/<videoid>
261 |     /embed/<videoid>
262 |     /embed/videoseries?list=<playlistid>
263 |     /watch/<videoid>
264 |     /playlist?list=<playlistid>
265 |     /channel/<channelid>
266 |     /user/<username>
267 |     /watch_videos?video_ids=<videoid>,<videoid>,...
268 | 
269 | # protoc
270 | 
271 | Some id's are base64 encoded protobuf packets, like: clickTrackingParams, continuation.
272 | 
273 | 
274 | # Research tool
275 | 
276 | I added a tool: `ytdump.py`, which i use to investigate youtube json dictionaries.
277 | 
278 | # TODO
279 | 
280 |  * DONE extract 'listid' from video links for playlist view.
281 |  * DONE list a channel's video's
282 |  * DONE list a user's video's
283 |  * handle radio links
284 |  * DONE extract live-chat comments
285 |  * Filter out duplicates from the livechat replay dump.
286 |  * DONE make my tool work with an actual live chat.
287 |  * DONE youtube search results.
288 |  * generalize the way continuations are used.
289 |  * add upload date and duration in the video lists.
290 |  * DONE automatically update the innertubeapikey and clientversion
291 |  * get original filename from studio.youtube.com/video/<videoid>/edit
292 |  * playlist editor / organiser
293 |  * community post listing
294 |  * list all on video messages, like cards, etc.
295 |  * list video markers, like in https://www.youtube.com/watch?v=i2KdE-cYMJk
296 |  * list other videos from the same channel.
297 |  * add time, likes to comments
298 |  * repair the `--replay` option.
299 | 
300 | 
301 | # AUTHOR
302 | 
303 | Willem Hengeveld <itsme@xs4all.nl>
304 | 
305 | 


--------------------------------------------------------------------------------
/yttool.py:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env python3
   2 | """
   3 | A tool for extracting useful information from youtube video's, like comments, or subtitles.
   4 | 
   5 | Author: Willem Hengeveld <itsme@xs4all.nl>
   6 | 
   7 | """
   8 | 
   9 | import urllib.request
  10 | import urllib.parse
  11 | import http.cookiejar
  12 | import re
  13 | import json
  14 | import sys
  15 | import html
  16 | import datetime
  17 | from collections import defaultdict
  18 | from xml.parsers.expat import ParserCreate
  19 | 
  20 | import http.client
  21 | 
  22 | 
  23 | def load_socks_proxy(proxyarg):
  24 |     m = re.match(r'(?:(\w+)://)?(\S+):(\d+)', proxyarg)
  25 |     if not m:
  26 |         return
  27 |     method, host, port = m.groups()
  28 |     port = int(port)
  29 | 
  30 |     if not method or not method.startswith('socks'):
  31 |         return
  32 | 
  33 |     import socks
  34 |     socks.setdefaultproxy(socks.SOCKS4 if method.startswith('socks4') else socks.SOCKS5, host, port)
  35 |     def create_connection(address, timeout=None, source_address=None):
  36 |         sock = socks.socksocket()
  37 |         sock.connect(address)
  38 |         return sock
  39 |     import socket
  40 |     socket.create_connection = create_connection
  41 |     socket.socket = socks.socksocket
  42 | 
  43 | 
  44 | def decode_proxy(proxyarg):
  45 |     if m:= re.match(r'(?:(\w+)://)?(\S+):(\d+)', proxyarg):
  46 |         method, host, port = m.groups()
  47 |         port = int(port)
  48 |         if not method or method.startswith('http'):
  49 |             return { 'http': proxyarg, 'https': proxyarg }
  50 | 
  51 | def cvdate(txt):
  52 |     """
  53 |     Convert a string with a date in ymd format to a date object.
  54 |     """
  55 |     ymd = txt.split("-")
  56 |     if len(ymd)!=3:
  57 |         print("WARNING: invalid date format: %s" % txt)
  58 |         return
  59 |     y, m, d = [int(_) for _ in ymd]
  60 |     return datetime.date(y, m, d)
  61 | 
  62 | 
  63 | def cvseconds(txt):
  64 |     """
  65 |     Convert string containing a number of seconds to a timedelta object.
  66 |     """
  67 |     return datetime.timedelta(seconds=int(txt))
  68 | 
  69 | 
  70 | def getitembymember(a, member):
  71 |     """
  72 |     Get the first item from 'a' which has an element named 'member'
  73 |     """
  74 |     for item in a:
  75 |         if member in item:
  76 |             return item
  77 | 
  78 | 
  79 | def getitem(d, *path):
  80 |     """
  81 |     Traverse a nested python object, path items select which object is selected:
  82 |      * a tuple: selects a dictionary from a list which contains the specified key
  83 |      * an integer: select the specified item from a list.
  84 |      * a string: select the specified item from a dictionary.
  85 |     """
  86 |     for k in path:
  87 |         if d is None:
  88 |             return
  89 |         if type(k) == tuple:
  90 |             d = getitembymember(d, *k)
  91 |         elif type(k) == int:
  92 |             d = d[k]
  93 |         else:
  94 |             d = d.get(k)
  95 | 
  96 |     return d
  97 | 
  98 | def extracttext(entry):
  99 |     return entry.get("simpleText") or "".join(r.get('text', "") for r in entry.get("runs"))
 100 | 
 101 | 
 102 | def getcontinuation(p):
 103 |     cont = getitem(p, "contents", 0, "continuationItemRenderer")
 104 |     if cont:
 105 |         return cont
 106 | 
 107 |     p = getitem(p, "continuations", 0, "nextContinuationData")
 108 |     if p:
 109 |         return p["continuation"], p["clickTrackingParams"]
 110 | 
 111 | 
 112 | class Youtube:
 113 |     """
 114 |     Class which knows how to get information from youtune video's
 115 |     """
 116 |     def __init__(self, args):
 117 |         self.args = args
 118 |         cj = http.cookiejar.CookieJar()
 119 |         cj.set_cookie(http.cookiejar.Cookie(version=0, name="CONSENT", value="YES+cb.20210420-15-p1.en+FX+374", port=None, port_specified=False, domain=".youtube.com", domain_specified=True, domain_initial_dot=True, path="/", path_specified=True, secure=False, expires=None, discard=False, comment=None, comment_url=None, rest={}))
 120 | 
 121 |         self.cp = urllib.request.HTTPCookieProcessor(cj)
 122 | 
 123 |         handlers = [self.cp]
 124 |         if args.proxy:
 125 |             proxies = decode_proxy(args.proxy)
 126 |             if proxies:
 127 |                 handlers.append(urllib.request.ProxyHandler(proxies))
 128 |         if args.debug:
 129 |             handlers.append(urllib.request.HTTPSHandler(debuglevel=1))
 130 |         self.opener = urllib.request.build_opener(*handlers)
 131 |         self.innertubeapikey = "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8"   # "INNERTUBE_API_KEY": "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8",
 132 |         self.clientname =      "1"                                         # "INNERTUBE_CONTEXT_CLIENT_NAME": 1,
 133 |         self.clientversion =  "2.20210422.04.00"                           # "INNERTUBE_CONTEXT_CLIENT_VERSION": "2.20210404.08.00",
 134 |         self.idtoken = "QUFFLUhqa1oySl9mbm9mODhfdENjQWdDcENvazM2RS1qZ3w="  # "ID_TOKEN": "QUFFLUhqa1oySl9mbm9mODhfdENjQWdDcENvazM2RS1qZ3w=",
 135 | 
 136 |         html = self.httpreq("https://www.youtube.com/")
 137 |         cfg = self.getytcfg(html.decode('utf-8'))
 138 | 
 139 |         self.innertubeapikey = cfg.get("INNERTUBE_API_KEY")
 140 |         self.clientname =      cfg.get("INNERTUBE_CONTEXT_CLIENT_NAME")
 141 |         self.clientversion =   cfg.get("INNERTUBE_CONTEXT_CLIENT_VERSION")
 142 |         
 143 | 
 144 |     def httpreq(self, url, data=None):
 145 |         """
 146 |         Does GET or POST request to youtube.
 147 |         """
 148 |         hdrs = {
 149 |             "x-youtube-client-name": "1",
 150 |             "x-youtube-client-version": self.clientversion,
 151 |             #"X-Youtube-Identity-Token": self.idtoken,
 152 |             "User-Agent": "Mozilla/5.0 (Mac) Gecko/20100101 Firefox/76.0",
 153 |         }
 154 |         if type(data)==bytes and data[:1] in (b'{', b'['):
 155 |             hdrs["Content-Type"] = "application/json"
 156 | 
 157 |         req = urllib.request.Request(url, headers=hdrs)
 158 | 
 159 |         kwargs = dict()
 160 |         if data is not None:
 161 |             kwargs["data"] = data
 162 | 
 163 |         response = self.opener.open(req, **kwargs)
 164 |         try:
 165 |             page = response.read()
 166 |         except http.client.IncompleteRead as e:
 167 |             page = e.partial
 168 |             print("EXCEPTION FOUND: http.client.IncompleteRead")
 169 |             pass
 170 | 
 171 |         return page
 172 | 
 173 |     def getcomments(self, cont, xsrf, replies=False):
 174 |         """
 175 |         Returns comments for the specified continuation parameter.
 176 |         """
 177 |         cmd = getitem(cont, "continuationEndpoint") or getitem(cont, "button", "buttonRenderer", "command")
 178 |         url = getitem(cmd, "commandMetadata", "webCommandMetadata", "apiUrl")
 179 |         postreq = {
 180 |             "context":{"client":{"clientName":"WEB","clientVersion":self.clientversion}},
 181 |             "continuation": getitem(cmd, "continuationCommand", "token"),
 182 |         }
 183 | 
 184 |         return self.httpreq("https://www.youtube.com" + url + "?" + urllib.parse.urlencode({"key":self.innertubeapikey}), json.dumps(postreq).encode('utf-8') )
 185 | 
 186 |     def getchat(self, cont, live=False):
 187 |         """
 188 |         Returns chat for the specified continuation parameter.
 189 |         """
 190 |         if live:
 191 |             url = "https://www.youtube.com/live_chat"
 192 |         else:
 193 |             url = "https://www.youtube.com/live_chat_replay"
 194 |         query = {
 195 |             "pbj": 1,
 196 |             "continuation": cont,
 197 |         }
 198 | 
 199 |         return self.httpreq(url + "?" + urllib.parse.urlencode(query))
 200 | 
 201 |     def getchat2(self, cont, offset, live=False):
 202 |         """
 203 |         Returns chat for the specified continuation parameter.
 204 |         """
 205 |         if live:
 206 |             url = "https://www.youtube.com/youtubei/v1/live_chat_replay/get_live_chat"
 207 |         else:
 208 |             url = "https://www.youtube.com/youtubei/v1/live_chat_replay/get_live_chat_replay"
 209 |         query = {
 210 |             "pbj": 1,
 211 |             "continuation": cont,
 212 |             "playerOffsetMs": offset,
 213 |             "hidden": False,
 214 |             "commandMetadata": "[object Object]",
 215 |         }
 216 | 
 217 |         return self.httpreq(url + "?" + urllib.parse.urlencode(query))
 218 | 
 219 |     def getlivechat(self, cont):
 220 |         url = "https://www.youtube.com/youtubei/v1/live_chat/get_live_chat"
 221 |         query = { "key": self.innertubeapikey, }
 222 |         postdata = {
 223 |             "context": { "client": {   "clientName": "WEB", "clientVersion": self.clientversion } },
 224 |             "continuation": cont
 225 |         }
 226 | 
 227 |         return self.httpreq(url + "?" + urllib.parse.urlencode(query), json.dumps(postdata).encode('utf-8'))
 228 | 
 229 | 
 230 |     def getsearch(self, cont):
 231 |         """
 232 |         Returns next batch of search results
 233 |         """
 234 |         url = "https://www.youtube.com/youtubei/v1/search"
 235 |         query = {
 236 |             "key": self.innertubeapikey
 237 |         }
 238 |         postdata = {
 239 |             "context": { "client": {   "clientName": "WEB", "clientVersion": self.clientversion } },
 240 |             "continuation": cont,
 241 |         }
 242 |         postdata = json.dumps(postdata)
 243 |         return self.httpreq(url + "?" + urllib.parse.urlencode(query), postdata.encode('ascii'))
 244 | 
 245 |     def browse(self, cont):
 246 |         """
 247 |         Returns videos for the specified continuation parameter.
 248 |         """
 249 |         cmd = getitem(cont, "continuationEndpoint")
 250 |         url = getitem(cmd, "commandMetadata", "webCommandMetadata", "apiUrl")
 251 |         postreq = {
 252 |             "context":{"client":{"clientName":"WEB","clientVersion":self.clientversion}},
 253 |             "continuation": getitem(cmd, "continuationCommand", "token"),
 254 |         }
 255 | 
 256 |         return self.httpreq("https://www.youtube.com" + url + "?" + urllib.parse.urlencode({"key":self.innertubeapikey}), json.dumps(postreq).encode('utf-8') )
 257 | 
 258 | 
 259 |     def getpageinfo(self, yturl):
 260 |         """
 261 |         Returns the youtube configuration object.
 262 |         """
 263 |         ytcfgtext = self.httpreq(yturl + ("&" if yturl.find('?')>=0 else "?") + "pbj=1")
 264 |         if self.args.debug:
 265 |             print("============ youtube config")
 266 |             print(ytcfgtext.decode('utf-8'))
 267 |             print()
 268 | 
 269 |         try:
 270 |             return json.loads(ytcfgtext.lstrip(b")]}'"))
 271 |         except Exception as e:
 272 |             if self.args.verbose:
 273 |                 print("EXCEPTION in getpageinfo: %s" % e)
 274 |             if self.args.debug:
 275 |                 raise
 276 |             return
 277 | 
 278 |     def getytcfg(self, ythtml):
 279 |         ytcfg = {}
 280 |         for m in re.finditer(r'ytcfg\.set\((\{.*?\})\)', ythtml):
 281 |             jsontxt = m.group(1).replace("'", '"').replace('",}', '"}')
 282 |             ytcfg.update(json.loads(jsontxt))
 283 |         return ytcfg
 284 | 
 285 | 
 286 |     def getconfigfromhtml(self, ythtml):
 287 |         """
 288 |         Alternative method of extracting the config object.
 289 |         By parsing the html page returned by youtube.
 290 |         """
 291 |         if self.args.debug:
 292 |             print("============ youtube page")
 293 |             print(ythtml.decode('utf-8'))
 294 |             print()
 295 | 
 296 |         m = re.search(br'ytplayer.config = (.*?);ytplayer.load', ythtml)
 297 |         if not m:
 298 |             print("could not find config")
 299 |             return
 300 |         cfgtext = m.group(1)
 301 |         if self.args.debug:
 302 |             print("========== config json")
 303 |             print(cfgtext.decode('utf-8'))
 304 |             print()
 305 | 
 306 |         cfg = json.loads(cfgtext)
 307 |         
 308 |         playertext = cfg['args']['player_response']
 309 |         if self.args.debug:
 310 |             print("========== player json")
 311 |             print(playertext)
 312 |             print()
 313 |         return json.loads(playertext)
 314 | 
 315 |     def extractsearchconfig(self, html):
 316 |         if self.args.debug:
 317 |             print("============ youtube page")
 318 |             print(html.decode('utf-8'))
 319 |             print()
 320 |         m = re.search(br'window["ytInitialData"] = (.*);', html)
 321 |         if not m:
 322 |             print("could not find config")
 323 |             return
 324 |         cfgtext = m.group(1)
 325 |         if self.args.debug:
 326 |             print("========== config json")
 327 |             print(cfgtext.decode('utf-8'))
 328 |             print()
 329 | 
 330 |         return json.loads(cfgtext)
 331 | 
 332 | def strunescape(txt):
 333 |     txt = re.sub(r'\\x(\w\w)', lambda m:chr(int(m.group(1), 16)), txt)
 334 |     txt = re.sub(r'\\n', "\n", txt)
 335 |     txt = re.sub(r'\\r', "\r", txt)
 336 |     txt = re.sub(r'\\t', "\t", txt)
 337 |     txt = re.sub(r'\\/', "/", txt)
 338 |     return txt
 339 | 
 340 | def filterhtml(html):
 341 |     """
 342 |     extract 4 different dictionaries from the html page.
 343 |     -- ytInitialPlayerResponse
 344 |     -- ytcfg.set()
 345 |     -- ytplayer.web_player_context_config
 346 |     -- ytInitialData 
 347 |     """
 348 | 
 349 |     result = {}
 350 |     for m in re.finditer(r'ytcfg\.set\(([^{}]*?),([^{}]*?)\)', html):
 351 |         #print("yt1", m.groups())
 352 |         pass
 353 |     result["ytcfg"] = {}
 354 |     for m in re.finditer(r'ytcfg\.set\((\{.*?\})\)', html):
 355 |         #print("yt2", m.group(1))
 356 |         jsontxt = m.group(1).replace("'", '"').replace('",}', '"}')
 357 |         result["ytcfg"].update(json.loads(jsontxt))
 358 |         # TIMING_INFO.cver: "2.20210111.08.00",
 359 | 
 360 |     if m := re.search(r'<script type="application/ld\+json"[^>]*>(\{.*?\})</script>', html):
 361 |         #print("ld", m.group(1))
 362 |         result["ldjson"] = json.loads(strunescape(m.group(1)))
 363 | 
 364 |     if m := re.search(r'ytplayer.web_player_context_config = (\{.*?\});', html):
 365 |         #print("cfg", m.group(1))
 366 |         result["playercg"] = json.loads(m.group(1))
 367 |         # device.interfaceVersion: "2.20210111.08.00",
 368 |         # "innertubeApiKey": "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8",
 369 |         # "innertubeContextClientVersion": "2.20210111.08.00",
 370 | 
 371 |     if m := re.search(r'setMessage\((\{.*?\})\);', html):
 372 |         #print("msg", m.group(1))
 373 |         result["msg"] = json.loads(m.group(1))
 374 | 
 375 |     if m := re.search(r'<script[^>]*>var ytInitialPlayerResponse = (\{.*?\});', html):
 376 |         #print("initplayer", m.group(1))
 377 |         result["initplayer"] = json.loads(m.group(1))
 378 |         # note: this is the same as pbj.[].playerResponse
 379 | 
 380 |     if m := re.search(r'<script[^>]*>var ytInitialData = (\{.*?\});</script>', html):
 381 |         #print("initdata", m.group(1))
 382 |         result["initdata"] = json.loads(m.group(1))
 383 |         # note: this is the same as pbj.[].response
 384 |     if m := re.search(r'<script[^>]*>window\["ytInitialData"\] = (\{.*?\});</script>', html):
 385 |         #print("initdata", m.group(1))
 386 |         result["initdata"] = json.loads(m.group(1))
 387 |         # note: this is the same as pbj.[].response
 388 | 
 389 | 
 390 |     return result
 391 | 
 392 | 
 393 | 
 394 | class LivechatReader:
 395 |     """
 396 |     class reads a livechat or livechat replay.
 397 |     """
 398 |     def __init__(self, args, yt, cfg, live=False):
 399 |         self.args = args
 400 |         self.yt = yt
 401 |         self.live = live
 402 |         self.cont = self.getchatinfo(cfg)
 403 | 
 404 |     def getcontinuation(self, p):
 405 |         p = getitem(p, "continuations", 0, "reloadContinuationData")
 406 |         if not p:
 407 |             return
 408 |         return p["continuation"]
 409 | 
 410 |     def getchatinfo(self, cfg):
 411 |         """
 412 |         Find the base parameters for querying the video's comments.
 413 | 
 414 |         """
 415 |         item = getitem(cfg, "initdata", "contents", "twoColumnWatchNextResults", "conversationBar", "liveChatRenderer")
 416 |         if not item:
 417 |             return
 418 | 
 419 |         return self.getcontinuation(item)
 420 | 
 421 |     def recursechat(self):
 422 |         if not self.cont:
 423 |             print("no live chat replay found")
 424 |             return
 425 |         ms = 0
 426 |         while True:
 427 |             #cmtjson = self.yt.getchat2(self.cont, ms, self.live)
 428 |             cmtjson = self.yt.getchat(self.cont, self.live)
 429 |             if self.args.debug:
 430 |                 print("============ chat req")
 431 |                 print(cmtjson.decode('utf-8'))
 432 |                 print()
 433 |             if cmtjson.startswith(b"<!DOCTYPE"):
 434 |                 js = filterhtml(cmtjson.decode('utf-8'))
 435 |                 if self.args.debug:
 436 |                     print("============ chat req extracted json")
 437 |                     print(json.dumps(js))
 438 |                     print()
 439 |             else:
 440 |                 js = json.loads(cmtjson)
 441 | 
 442 |             cmtlist, newms = self.extractchat(js["initdata"]) 
 443 |             if newms==ms:
 444 |                 break
 445 | 
 446 |             for author, time, comment in cmtlist:
 447 |                 print("--->", time, author)
 448 |                 print(extracttext(comment))
 449 | 
 450 |             ms = newms
 451 | 
 452 |         print("========== live ===========")
 453 | 
 454 |         self.monitorchat(js["initdata"])
 455 | 
 456 | 
 457 |     def extractchat(self, js):
 458 |         actions = getitem(js, "continuationContents", "liveChatContinuation", "actions")
 459 |         if not actions:
 460 |             return [], None
 461 | 
 462 |         cmtlist = []
 463 |         ms = None
 464 | 
 465 |         def addchatitem(item):
 466 |             msg = getitem(item, "message")
 467 |             author = getitem(item, "authorName", "simpleText")
 468 |             time = getitem(item, "timestampText", "simpleText")
 469 |             if time is None:
 470 |                 timeusec = getitem(item, "timestampUsec")
 471 |                 if timeusec is not None:
 472 |                     dt = datetime.datetime.fromtimestamp(int(timeusec)/1000000)
 473 |                     time = dt.strftime("%Y-%m-%d %H:%M:%S")
 474 | 
 475 |             cmtlist.append((author, time, msg))
 476 | 
 477 |         for act in actions:
 478 |             replayactions = getitem(act, "replayChatItemAction", "actions")
 479 |             ms = getitem(act, "replayChatItemAction", "videoOffsetTimeMsec")
 480 | 
 481 |             if replayactions:
 482 |                 for ract in replayactions:
 483 |                     item = getitem(ract, "addChatItemAction", "item", "liveChatTextMessageRenderer")
 484 |                     if item:
 485 |                         addchatitem(item)
 486 | 
 487 |             item = getitem(act, "addChatItemAction", "item", "liveChatTextMessageRenderer")
 488 |             if item:
 489 |                 addchatitem(item)
 490 | 
 491 |         return cmtlist, ms
 492 | 
 493 |     def monitorchat(self, js):
 494 |         while True:
 495 |             cont = getitem(js, "continuationContents", "liveChatContinuation", "continuations", 0, "invalidationContinuationData", "continuation")
 496 |             respjson = self.yt.getlivechat(cont)
 497 |             if self.args.debug:
 498 |                 print("============ comment req")
 499 |                 print(respjson.decode('utf-8'))
 500 |                 print()
 501 |             js = json.loads(respjson)
 502 | 
 503 |             cmtlist, newms = self.extractchat(js) 
 504 | 
 505 |             for author, time, comment in cmtlist:
 506 |                 print("--->", time, author)
 507 |                 print(extracttext(comment))
 508 |             sys.stdout.flush()
 509 | 
 510 |             import time
 511 |             time.sleep(1)
 512 | 
 513 | 
 514 | 
 515 | class CommentReader:
 516 |     """
 517 |     class which can recursively print comments
 518 |     """
 519 |     def __init__(self, args, yt, cfg):
 520 |         self.args = args
 521 |         self.yt = yt
 522 |         self.contclick, self.xsrf = self.getcommentinfo(cfg)
 523 | 
 524 |     def recursecomments(self, cc=None, level=0):
 525 |         if not cc and not level:
 526 |             cc = self.contclick
 527 |         while cc:
 528 |             cmtjson = self.yt.getcomments(cc, self.xsrf, replies=(level>0))
 529 |             if self.args.debug:
 530 |                 print("============ comment req")
 531 |                 print(cmtjson.decode('utf-8'))
 532 |                 print()
 533 | 
 534 |             if not cmtjson:
 535 |                 raise Exception("empty response")
 536 | 
 537 |             js = json.loads(cmtjson)
 538 | 
 539 |             if type(js)==list:
 540 |                 # this is for 'replies', which return an array instead of a dict as the top-level response.
 541 |                 js = getitem(js, ("response",))
 542 | 
 543 |             cmtlist, cc = self.extractcomments(js) 
 544 | 
 545 |             for author, when, comment, likes, replies, subcc in cmtlist:
 546 |                 if self.args.verbose:
 547 |                     print("---" * (level+1) + ">", "%s ; %s ; %s likes ; %s replies" % (author, when, likes, replies))
 548 |                 else:
 549 |                     print("---" * (level+1) + ">", author)
 550 |                 print(extracttext(comment))
 551 |                 if subcc:
 552 |                     self.recursecomments(subcc, level+1)
 553 | 
 554 |     def getcommentinfo(self, cfg):
 555 |         """
 556 |         Find the base parameters for querying the video's comments.
 557 | 
 558 |         """
 559 |         item = getitem(cfg, "initdata", "contents", "twoColumnWatchNextResults", "results", "results", "contents")
 560 |         cont = getcontinuation(getitem(item, ("itemSectionRenderer",), "itemSectionRenderer")) 
 561 |         xsrftoken = getitem(cfg, "ytcfg", "XSRF_TOKEN")
 562 |         xsrffield = getitem(cfg, "ytcfg", "XSRF_FIELD_NAME")
 563 | 
 564 |         xsrfdict = { xsrffield: xsrftoken } if xsrftoken else {}
 565 | 
 566 |         return cont, xsrfdict 
 567 | 
 568 |     def getcomment(self, p):
 569 |         """
 570 |         Return info for a single comment.
 571 |         """
 572 |         if "commentThreadRenderer" in p:
 573 |             p = p["commentThreadRenderer"]
 574 | 
 575 |         c = p
 576 |         r = p
 577 |         if "comment" in c:
 578 |             c = c["comment"]
 579 |         if "commentRenderer" in c:
 580 |             c = c["commentRenderer"]
 581 |         if "replies" in r:
 582 |             r = r["replies"]
 583 | 
 584 |         author = getitem(c,  "authorText", "simpleText")
 585 |         content = getitem(c,  "contentText")
 586 |         likes = getitem(c, "likeCount")
 587 |         nrreplies = getitem(c, "replyCount")
 588 |         when = extracttext(getitem(c,  "publishedTimeText"))
 589 |         replies = getitem(r,  "commentRepliesRenderer")
 590 |         if replies:
 591 |             cont = getcontinuation(replies)
 592 |         else:
 593 |             cont = None
 594 | 
 595 |         return author, when, content, int(likes or 0), int(nrreplies or 0), cont
 596 | 
 597 |     def extractcomments(self, js):
 598 |         """
 599 |         Extract a list of comments from comment dictionary
 600 |         """
 601 |         endpoints = getitem(js, "onResponseReceivedEndpoints")
 602 |         if not endpoints:
 603 |             return [], None
 604 | 
 605 |         cc = None
 606 |         cmtlist = []
 607 |         for p in endpoints:
 608 |             items = getitem(p, "reloadContinuationItemsCommand", "continuationItems")
 609 |             if not items:
 610 |                 items = getitem(p, "appendContinuationItemsAction", "continuationItems")
 611 |                 if not items:
 612 |                     continue
 613 |             for p in items:
 614 |                 c = getitem(p, "commentThreadRenderer")
 615 |                 if c:
 616 |                     cmtlist.append(self.getcomment(c))
 617 |                 c = getitem(p, "commentRenderer")
 618 |                 if c:
 619 |                     cmtlist.append(self.getcomment(c))
 620 |                 c = getitem(p, "continuationItemRenderer")
 621 |                 if c:
 622 |                     cc = c
 623 | 
 624 |         return cmtlist, cc
 625 | 
 626 | 
 627 | class SearchReader:
 628 |     def __init__(self, args, yt, cfg):
 629 |         self.args = args
 630 |         self.yt = yt
 631 |         self.cfg = cfg
 632 | 
 633 |     def getresults(self, js):
 634 |         ct = getitem(js, "contents", "twoColumnSearchResultsRenderer", "primaryContents", "sectionListRenderer", "contents")
 635 |         if not ct:
 636 |             ct = getitem(js, "onResponseReceivedCommands", 0, "appendContinuationItemsAction", "continuationItems")
 637 | 
 638 |         resultlist = getitem(ct, ("itemSectionRenderer",), "itemSectionRenderer", "contents")
 639 |         cont = getitem(ct, ("continuationItemRenderer",), "continuationItemRenderer", "continuationEndpoint", "continuationCommand", "token")
 640 | 
 641 |         return resultlist, cont
 642 | 
 643 |     def recursesearch(self):
 644 | 
 645 |         resultlist, cont = self.getresults(getitem(self.cfg, "initdata"))
 646 |         while True:
 647 |             for item in resultlist:
 648 |                 if video := item.get("videoRenderer"):
 649 |                     vid = getitem(video, "videoId")
 650 |                     pub = getitem(video, "publishedTimeText", "simpleText")
 651 |                     title = getitem(video, "title")
 652 |                     # title -> runs
 653 |                     # descriptionSnippet -> runs
 654 |                     # publishedTimeText -> simpleText
 655 |                     # lengthText -> simpleText
 656 |                     # viewCountText -> simpleText
 657 |                     # ownerText -> runs
 658 |                     print("%s - %s" % (vid, extracttext(title)))
 659 |                 elif chan := item.get("channelRenderer"):
 660 |                     cid = getitem(chan, "channelId")
 661 |                     title = getitem(chan, "title", "simpleText")
 662 |                     # "videoCountText" -> runs
 663 |                     # subscriberCountText -> simpleText
 664 |                     # descriptionSnippet -> runs
 665 |                     print("%s - %s" % (cid, title))
 666 | 
 667 |             jstext = self.yt.getsearch(cont)
 668 |             js = json.loads(jstext)
 669 |             resultlist, cont = self.getresults(js)
 670 | 
 671 | 
 672 | class DetailReader:
 673 |     """
 674 |     Extract some details for a video from the config.
 675 |     """
 676 |     def __init__(self, args, yt, cfg):
 677 |         self.args = args
 678 |         self.yt = yt
 679 |         self.cfg = cfg
 680 | 
 681 |     def output(self):
 682 |         vd = getitem(self.cfg, "initplayer", "videoDetails")
 683 |         mf = getitem(self.cfg, "initplayer", "microformat", "playerMicroformatRenderer")
 684 |         twocol = getitem(self.cfg, "initdata", "contents", "twoColumnWatchNextResults", "results", "results", "contents")
 685 |         sentiment = getitem(twocol, ("videoPrimaryInfoRenderer",), "videoPrimaryInfoRenderer", "sentimentBar", "sentimentBarRenderer", "tooltip")
 686 | 
 687 |         if not mf:
 688 |             print("microformat not found")
 689 |             return
 690 | 
 691 |         vc = int(mf.get("viewCount"))
 692 |         ls = cvseconds(mf.get("lengthSeconds"))
 693 |         pd = cvdate(mf.get("publishDate"))
 694 |         ud = cvdate(mf.get("uploadDate"))
 695 |         desc = getitem(mf, "description", "simpleText")
 696 | 
 697 |         vid = vd.get("videoId")
 698 | 
 699 |         title = getitem(mf, "title", "simpleText")
 700 |         owner = getitem(mf, "ownerChannelName")
 701 | 
 702 |         print("%s - %s" % (vid, title))
 703 |         print("By: %s" % (owner))
 704 |         print()
 705 |         print("viewcount: %d, length: %s, sentiment: %s, published: %s%s" % (vc, ls, sentiment, pd, "" if pd==ud else ", uploaded at: %s" % ud))
 706 |         print()
 707 |         print("%s" % desc)
 708 |         print()
 709 | 
 710 | 
 711 | class SubtitleReader:
 712 |     """
 713 |     class which can print a video's subtitles
 714 |     """
 715 |     def __init__(self, args, yt, cfg):
 716 |         self.args = args
 717 |         self.yt = yt
 718 |         self.cfg = cfg
 719 | 
 720 |     def languagematches(self, language, ct):
 721 |         """
 722 |         Match a captionTrack record to the language filter.
 723 |         """
 724 |         if language == 'asr' and ct.get('kind') == 'asr':
 725 |             return True
 726 |         if ct["name"]["simpleText"] == language:
 727 |             return True
 728 |         if ct["languageCode"] == language:
 729 |             return True
 730 | 
 731 |     def output(self):
 732 |         js = getitem(self.cfg, "initplayer")
 733 |         p = getitem(js, "captions", "playerCaptionsTracklistRenderer", "captionTracks")
 734 |             
 735 |         if not p:
 736 |             print("no subtitles found")
 737 |             return
 738 | 
 739 |         captiontracks = p
 740 | 
 741 |         # filter subtitles based on language
 742 |         if self.args.language:
 743 |             captiontracks = self.filtertracks(self.args.language, captiontracks)
 744 | 
 745 |         for ct in captiontracks:
 746 |             if len(captiontracks) > 1:
 747 |                 print("###  %s ###" % ct["name"]["simpleText"])
 748 | 
 749 |             self.outputsubtitles(ct["baseUrl"])
 750 | 
 751 |             if len(captiontracks) > 1:
 752 |                 print()
 753 | 
 754 |     def filtertracks(self, language, captiontracks):
 755 |         matchedtracks = defaultdict(list)
 756 |         for ct in captiontracks:
 757 |             if not self.languagematches(language, ct):
 758 |                 continue
 759 | 
 760 |             matchedtracks[ct["languageCode"]].append(ct)
 761 | 
 762 |         filteredlist = []
 763 |         for lang, tracks in matchedtracks.items():
 764 |             if len(tracks) > 1:
 765 |                 # prefer non automated translation
 766 |                 tracks = filter(lambda ct:ct.get("kind") != "asr", tracks)
 767 |             filteredlist.extend(tracks)
 768 | 
 769 |         return filteredlist
 770 | 
 771 |     def outputsubtitles(self, cturl):
 772 |         ttxml = self.yt.httpreq(cturl)
 773 |         if self.args.debug:
 774 |             print("========== timedtext xml")
 775 |             print(ttxml.decode('utf-8'))
 776 |             print()
 777 |         tt = self.extractxmltext(ttxml)
 778 | 
 779 |         if self.args.srt:
 780 |             self.output_srt(tt)
 781 |         elif self.args.verbose:
 782 |             for t0, t1, txt in tt:
 783 |                 print("%s  %s" % (self.formattime(t0), txt))
 784 |         else:
 785 |             for t0, t1, txt in tt:
 786 |                 print(txt)
 787 | 
 788 |     @staticmethod
 789 |     def formattime(t):
 790 |         m = int(t/60) ; t -= 60*m
 791 |         h = int(m/60) ; m -= 60*h
 792 |         return "%d:%02d:%06.3f" % (h, m, t)
 793 | 
 794 |     @staticmethod
 795 |     def srttime(t):
 796 |         return SubtitleReader.formattime(t).replace('.', ',')
 797 | 
 798 |     @staticmethod
 799 |     def output_srt(tt):
 800 |         n = 1
 801 |         for t0, t1, txt in tt:
 802 |             print(n)
 803 |             print("%s --> %s" % (SubtitleReader.srttime(t0), SubtitleReader.srttime(t1)))
 804 |             print(txt)
 805 |             print()
 806 | 
 807 |     @staticmethod
 808 |     def unhtml(htmltext):
 809 |         """
 810 |         Removes html font tags, and decodes html entities
 811 |         """
 812 |         return html.unescape(re.sub(r'</?font[^>]*>', '', htmltext))
 813 | 
 814 |     def extractxmltext(self, xml):
 815 |         """
 816 |         Returns a list of tuples: time, endtime, text
 817 |         """
 818 |         lines = []
 819 |         tstart = None
 820 |         tend = None
 821 |         text = None
 822 |         def handle_begin_element(elem, attr):
 823 |             nonlocal text, tstart, tend
 824 |             if elem == 'text':
 825 |                 text = ""
 826 |                 tstart = float(attr.get('start'))
 827 |                 tend = tstart + float(attr.get('dur'))
 828 | 
 829 |         def handle_end_element(elem):
 830 |             nonlocal text
 831 |             if elem == 'text':
 832 |                 lines.append((tstart, tend, self.unhtml(text)))
 833 |                 text = None
 834 |         def handle_data(data):
 835 |             nonlocal text
 836 |             if text is not None:
 837 |                 text += data
 838 | 
 839 |         parser = ParserCreate()
 840 |         parser.StartElementHandler = handle_begin_element
 841 |         parser.EndElementHandler = handle_end_element
 842 |         parser.CharacterDataHandler = handle_data
 843 |         parser.Parse(xml, 1)
 844 | 
 845 |         return lines
 846 | 
 847 | 
 848 | class PlaylistReader:
 849 |     """
 850 |     class which can print a playlist's contents.
 851 |     """
 852 |     def __init__(self, args, yt, cfg):
 853 |         self.args = args
 854 |         self.yt = yt
 855 |         self.cfg = cfg
 856 | 
 857 |     def output(self):
 858 |         # ==== [                  'playlistVideoRenderer', 1, 'contents', 'playlistVideoListRenderer', 0, 'contents', 'itemSectionRenderer', 0, 'contents', 'sectionListRenderer', 'content', 'tabRenderer', 0, 'tabs', 'twoColumnBrowseResultsRenderer', 'contents', 'response', 1]
 859 |         # ==== ['gridVideoRenderer', 1, 'items', 'horizontalListRenderer', 'content', 'shelfRenderer', 0, 
 860 |         #                     'contents', 'itemSectionRenderer', 1, 'contents', 'sectionListRenderer', 'content', 'tabRenderer', 0, 
 861 |         #                     'tabs', 'twoColumnBrowseResultsRenderer', 'contents', 'response', 1]
 862 |         playlist = getitem(self.cfg, "initdata", "contents", "twoColumnWatchNextResults", "playlist")
 863 |         if playlist:
 864 |             print("Title: %s" % getitem(playlist, "playlist", "title"))
 865 |             for entry in getitem(playlist, "playlist", "contents"):
 866 |                 vid = getitem(entry, "playlistPanelVideoRenderer", "videoId")
 867 |                 title = getitem(entry, "playlistPanelVideoRenderer", "title", "simpleText")
 868 |                 length = getitem(entry, "playlistPanelVideoRenderer", "lengthText", "simpleText")
 869 |                 if args.verbose:
 870 |                     print("%s - %s  %s" % (vid, length, title))
 871 |                 else:
 872 |                     print("%s - %s" % (vid, title))
 873 |             return
 874 |         tabs = getitem(self.cfg, "initdata", "contents", "twoColumnBrowseResultsRenderer", "tabs", 0, "tabRenderer", "content")
 875 |         ct1 = getitem(tabs, "sectionListRenderer", "contents", 0, "itemSectionRenderer", "contents", 0)
 876 |         playlist = getitem(ct1, "playlistVideoListRenderer")
 877 |         list_tag = "contents"
 878 |         entry_tag = "playlistVideoRenderer"
 879 |         if not playlist:
 880 |             playlist = getitem(ctl, "shelfRenderer", "content", 'horizontalListRenderer')
 881 |             list_tag = "items"
 882 |             entry_tag = "gridVideoRenderer"
 883 |         if playlist:
 884 |             cont = None
 885 |             for entry in playlist[list_tag]:
 886 |                 vid = getitem(entry, entry_tag, "videoId")
 887 |                 title = getitem(entry, entry_tag, "title")
 888 |                 if vid and title:
 889 |                     print("%s - %s" % (vid, extracttext(title)))
 890 |                 c = getitem(entry, "continuationItemRenderer")
 891 |                 if c:
 892 |                     cont = c
 893 | 
 894 |             if not cont:
 895 |                 cont = getcontinuation(playlist)
 896 |             while cont:
 897 |                 browsejson = self.yt.browse(cont)
 898 |                 if self.args.debug:
 899 |                     print("============ browse req")
 900 |                     print(browsejson.decode('utf-8'))
 901 |                     print()
 902 | 
 903 |                 js = json.loads(browsejson)
 904 | 
 905 |                 cont = None
 906 |                 playlist = getitem(js, "initdata", "continuationContents", "gridContinuation")
 907 |                 if playlist:
 908 |                     for entry in getitem(playlist, "items"):
 909 |                         vid = getitem(entry, "gridVideoRenderer", "videoId")
 910 |                         title = getitem(entry, "gridVideoRenderer", "title")
 911 |                         print("%s - %s" % (vid, extracttext(title)))
 912 |                 playlist = getitem(js, "initdata", "continuationContents", "playlistVideoListContinuation")
 913 |                 item_tag = "contents"
 914 |                 if not playlist:
 915 |                     playlist = getitem(js, "initdata", "onResponseReceivedActions", 0, "appendContinuationItemsAction")
 916 |                     item_tag = "continuationItems"
 917 |                 if not playlist:
 918 |                     playlist = getitem(js, "onResponseReceivedActions", 0, "appendContinuationItemsAction")
 919 |                     item_tag = "continuationItems"
 920 |                 if playlist:
 921 |                     for entry in getitem(playlist, item_tag):
 922 |                         vid = getitem(entry, "playlistVideoRenderer", "videoId")
 923 |                         title = getitem(entry, "playlistVideoRenderer", "title")
 924 |                         if vid and title:
 925 |                             print("%s - %s" % (vid, extracttext(title)))
 926 |                         c = getitem(entry, "continuationItemRenderer")
 927 |                         if c:
 928 |                             cont = c
 929 | 
 930 |                 if not playlist:
 931 |                     break
 932 |                 if not cont:
 933 |                     cont = getcontinuation(playlist)
 934 | 
 935 |             return
 936 | 
 937 | 
 938 | def parse_youtube_link(url):
 939 |     """
 940 |     Recognize different types of youtube urls:
 941 | 
 942 |     http://,   https://
 943 | 
 944 |     youtu.be/<videoid>[?list=<listid>]
 945 | 
 946 |     (?:www.)?youtube.com...
 947 | 
 948 |     /channel/<channelid>
 949 |     /c/<channelname>
 950 |     /playlist?list=<listid>
 951 |     /watch?v=<videoid> [&t=pos] [&list=<listid>]
 952 |     /watch/<videoid>
 953 |     /v/<videoid>
 954 |     /embed/<videoid>
 955 |     /user/<username>
 956 |     /watch_videos?video_ids=<videoid>,<videoid>,...
 957 |     /results?search_query=...
 958 |     """
 959 |     m = re.match(r'^(?:https?://)?(?:www\.)?(?:(?:youtu\.be|youtube\.com)/)?(.*)', url)
 960 |     if not m:
 961 |         raise Exception("youtube link not matched")
 962 | 
 963 |     path = m.group(1)
 964 | 
 965 |     if m := re.match(r'^user/([^/?]+)', path):
 966 |         yield 'username', m.group(1)
 967 |     elif m := re.match(r'^(\w+)/([A-Za-z0-9_-]+)(.*)', path):
 968 |         idtype = m.group(1)
 969 |         if idtype in ('v', 'embed', 'watch'):
 970 |             idtype = 'video'
 971 |         elif idtype in ('channel'):
 972 |             idtype = 'channel'
 973 |         elif idtype in ('c'):
 974 |             idtype = 'channelname'
 975 |         elif idtype in ('playlist'):
 976 |             idtype = 'playlist'
 977 |         else:
 978 |             raise Exception("unknown id type")
 979 | 
 980 |         idvalue = m.group(2)
 981 |         yield idtype, idvalue
 982 |         if idtype == 'channel':
 983 |             yield 'playlist', 'UU' + idvalue[2:]
 984 | 
 985 |         idargs = urllib.parse.parse_qs(m.group(3))
 986 |         if idvalue := idargs.get('v'):
 987 |             if idvalue[0]:
 988 |                 yield 'video', idvalue[0]
 989 |         if idvalue := idargs.get('list'):
 990 |             if idvalue[0]:
 991 |                 yield 'playlist', idvalue[0]
 992 | 
 993 |     elif m := re.match(r'^(v|embed|watch|channel|playlist)(?:\?(.*))?$', path):
 994 |         idtype = m.group(1)
 995 |         if idtype in ('v', 'embed', 'watch'):
 996 |             idtype = 'video'
 997 |         elif idtype in ('channel'):
 998 |             idtype = 'channel'
 999 |         elif idtype in ('playlist'):
1000 |             idtype = 'playlist'
1001 | 
1002 |         idargs = urllib.parse.parse_qs(m.group(2))
1003 |         if idvalue := idargs.get('v'):
1004 |             if idvalue[0]:
1005 |                 yield 'video', idvalue[0]
1006 |         if idvalue := idargs.get('list'):
1007 |             if idvalue[0]:
1008 |                 yield 'playlist', idvalue[0]
1009 | 
1010 |     elif m := re.match(r'^results\?(.*)$', path):
1011 |         idargs = urllib.parse.parse_qs(m.group(1))
1012 |         if idvalue := idargs.get('search_query'):
1013 |             if idvalue[0]:
1014 |                 yield 'search', idvalue[0]
1015 | 
1016 |     elif m := re.match(r'^[A-Za-z0-9_-]+$', path):
1017 |         if len(path)==11:
1018 |             yield 'video', path
1019 |         else:
1020 |             yield 'playlist', path
1021 |      
1022 |     else:
1023 |         raise Exception("unknown id")
1024 | 
def channelurl_from_userpage(cfg):
    """
    Return the "channelUrl" entry from the channel metadata in the
    initdata of `cfg`, as looked up by getitem.
    """
    # alternative locations:
    #    "initplayer", "microformat", "playerMicroformatRenderer", "externalChannelId"
    # or "initplayer", "videoDetails", "channelId"
    metadatapath = ("initdata", "metadata", "channelMetadataRenderer", "channelUrl")
    return getitem(cfg, *metadatapath)
1029 | 
def check_error(cfg):
    """
    Print the reason and return True when the playability status
    in `cfg` has status "ERROR".  Returns None in all other cases.
    """
    playability = getitem(cfg, "initplayer", "playabilityStatus")
    if playability and playability["status"] == "ERROR":
        print(playability["reason"])
        return True
    return None
1037 | 
def main():
    """
    Command-line entry point.

    Parses the command line, then for every url, id or query given:
      * uses parse_youtube_link to find the ids it contains,
      * requests a canonical youtube url for each id,
      * extracts the page config from the html with filterhtml,
      * runs the readers selected by the command line options
        (comments, subtitles, livechat / replay, playlist, search, info).

    A 'username' id is resolved to its channel url, which is then added
    to the list of items to process.
    """
    import io
    # always emit utf-8, independent of the encoding python picked for stdout.
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

    import argparse
    parser = argparse.ArgumentParser(description='Extract Youtube comments')
    parser.add_argument('--debug', '-d', action='store_true', help='print all intermediate steps')
    parser.add_argument('--verbose', '-v', action='store_true', help='prefix each line with the timestamp')
    parser.add_argument('--comments', '-c', action='store_true', help='Print video comments')
    parser.add_argument('--subtitles', '-t', action='store_true', help='Print video subtitles')
    parser.add_argument('--language', type=str, help='Output only subtitles in the specified language')
    parser.add_argument('--playlist', '-l', action='store_true', help='Print playlist items')
    parser.add_argument('--info', '-i', action='store_true', help='Print video info')
    parser.add_argument('--srt', action='store_true', help='Output subtitles in .srt format.')
    parser.add_argument('--query', '-q', action='store_true', help='List videos matching the specified query')
    parser.add_argument('--livechat', action='store_true', help='Follow livechat contents')
    parser.add_argument('--replay', action='store_true', help='Print livechat replay')
    parser.add_argument('--proxy', type=str, help='Specify a proxy to use.')
    parser.add_argument('ytids', nargs='+', type=str, help='One or more Youtube URLs, or IDs, or a query')
    args = parser.parse_args()

    if args.proxy and args.proxy.startswith('socks'):
        load_socks_proxy(args.proxy)

    yt = Youtube(args)

    # note: args.ytids can grow while this loop runs: channel urls found on
    # user pages are appended, and are processed after all entries which
    # were already in the list.
    for url in args.ytids:
        if len(args.ytids) > 1:
            print("==>", url, "<==")
        if args.query:
            # note: the 'url' variable holds the query.
            # convert it to a query url so the parse link function can decode it.
            url = "https://www.youtube.com/results?" + urllib.parse.urlencode({"search_query": url})

        # analyze url for id's, like videoid, channelid, playlistid or search query.
        for idtype, idvalue in parse_youtube_link(url):
            # reformat the url in a way that i am sure returns the right json data.
            # a separate variable is used, so an unrecognized idtype falls back
            # to the url as given, and not to the url built for a previous id.
            if idtype == 'video':
                pageurl = "https://www.youtube.com/watch?v=%s" % idvalue
            elif idtype == 'playlist':
                pageurl = "https://www.youtube.com/playlist?list=%s" % idvalue
            elif idtype == 'channel':
                pageurl = "https://www.youtube.com/channel/%s" % idvalue
            elif idtype == 'username':
                pageurl = "https://www.youtube.com/user/%s" % idvalue
            elif idtype == 'search':
                pageurl = "https://www.youtube.com/results?" + urllib.parse.urlencode({"search_query": idvalue})
            else:
                pageurl = url

            #cfg = yt.getpageinfo(pageurl)
            #if check_error(cfg):
            #    continue

            # decode only once, the text is used for both debug output and parsing.
            html = yt.httpreq(pageurl).decode('utf-8')
            if args.debug:
                print("============ youtube html")
                print(html)
                print()
            cfg = filterhtml(html)
            if args.debug:
                print("============ youtube extracted config")
                print(json.dumps(cfg))
                print()

            if idtype == 'username':
                channelurl = channelurl_from_userpage(cfg)
                if channelurl:
                    # the channel url is picked up later by the outer loop.
                    args.ytids.append(channelurl)
                else:
                    # don't queue an empty entry, it can not be parsed as a link.
                    print("no channel url found for user %s" % idvalue, file=sys.stderr)
                # none of the readers below apply to a user page.
                continue

            if args.comments and idtype == 'video':
                cmt = CommentReader(args, yt, cfg)
                cmt.recursecomments()
            if args.subtitles and idtype == 'video':
                txt = SubtitleReader(args, yt, cfg)
                txt.output()
            if (args.replay or args.livechat) and idtype == 'video':
                txt = LivechatReader(args, yt, cfg, live=args.livechat)
                txt.recursechat()
            if args.playlist and idtype == 'playlist':
                lst = PlaylistReader(args, yt, cfg)
                lst.output()
            if (args.playlist or args.query) and idtype == 'search':
                q = SearchReader(args, yt, cfg)
                q.recursesearch()
            if args.info and idtype == 'video':
                lst = DetailReader(args, yt, cfg)
                lst.output()
1124 | 
1125 | 
# run the commandline tool only when executed as a script, not when imported
# as a module (setup.py installs main as the 'yttool' console script).
if __name__ == '__main__':
    main()
1128 | 
1129 | 
1130 | 


--------------------------------------------------------------------------------