├── .gitignore ├── README.md └── youtube2srt.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | youtube2srt 2 | =========== 3 | 4 | A small command-line utility I had lying around for a good while that allows you to download closed captions from YouTube as a SRT file. 5 | 6 | ``` 7 | usage: youtube2srt.py [-h] [-l l1 [,l2...]] [-o OUTPUT] VIDEO_URL_OR_FILENAME 8 | 9 | Download closed captions of a YouTube video as a SRT file. 10 | 11 | positional arguments: 12 | VIDEO_URL_OR_FILENAME 13 | YouTube video url or filename 14 | 15 | optional arguments: 16 | -h, --help show this help message and exit 17 | -l l1 [,l2...], --lang l1 [,l2...] 
18 | comma separated list of two letter language codes 19 | (default: en) 20 | -o OUTPUT, --output OUTPUT 21 | write captions to FILE instead of video_id.srt 22 | ``` 23 | -------------------------------------------------------------------------------- /youtube2srt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Copyright (c) 2012, Ozan Sener 4 | # Contributor (c) 2016, Paulo Miguel Almeida Rodenas 5 | # All rights reserved. 6 | 7 | # Redistribution and use in source and binary forms, with or without 8 | # modification, are permitted provided that the following conditions are met: 9 | # 10 | # * Redistributions of source code must retain the above copyright notice, 11 | # this list of conditions and the following disclaimer. 12 | # * Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # * Neither the name of copyright holders nor the names of its contributors 16 | # may be used to endorse or promote products derived from this software 17 | # without specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | # ARE DISCLAIMED. 
# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

try:
    from urllib.parse import urlparse, urlencode, parse_qsl
    from urllib.request import urlopen
    from urllib.error import HTTPError, URLError
    from http.client import HTTPException
except ImportError:
    from urlparse import urlparse, parse_qsl
    from urllib import urlencode
    from urllib2 import urlopen, HTTPError, URLError
    from httplib import HTTPException

from os.path import basename, splitext, isfile
import codecs
import argparse
import sys
import xml.etree.ElementTree as ET
from collections import namedtuple
from contextlib import closing

TRACK_URL = 'http://video.google.com/timedtext?%s'
LIST_URL = 'http://www.youtube.com/api/timedtext?%s'
TRACK_KEYS = 'id name lang_original lang_translated lang_default'

Track = namedtuple('Track', TRACK_KEYS)
Line = namedtuple('Line', 'start duration text')

# Entities that are still present in the caption text after XML parsing.
# The order matters: '&amp;' is decoded after '&quot;' and before '&#39;'.
_ENTITIES = (
    ('&quot;', '"'),
    ('&amp;', '&'),
    ('&#39;', '\''),
)

# Seconds a caption stays on screen when neither its duration nor a
# following caption is available.
_FALLBACK_DURATION = 5


def save_srt(caption, filename):
    """Save a list of srt formatted lines in a srt file with UTF-8 BOM.

    codecs.open() opens the underlying file in binary mode, so the explicit
    CRLF line endings in each entry are written unchanged.
    """
    with codecs.open(filename, 'w', 'utf-8-sig') as srt_file:
        srt_file.writelines(caption)


def retrieve_caption(video_id, languages):
    """
    Fetch the first available track in a language list, convert it to srt and
    return the list of lines for a given youtube video_id.

    Returns None when no track in the requested languages could be found.
    """
    return convert_caption(get_track(video_id, languages))


def get_track(video_id, languages):
    """Return the first available track in a language list for a video.

    The result is a list of Line tuples, or None when none of the requested
    languages is available or the track cannot be parsed. Exits with
    status 6 on a network error, like get_track_list().
    """
    tracks = get_track_list(video_id)
    lang = next((code for code in languages if code in tracks), None)
    if lang is None:
        return None

    url = TRACK_URL % urlencode({'name': tracks[lang].name or '',
                                 'lang': lang,
                                 'v': video_id})
    try:
        # closing() because Python 2 responses are not context managers.
        with closing(urlopen(url)) as response:
            return parse_track(response)
    except (URLError, HTTPError, HTTPException) as err:
        print("Network error: Unable to retrieve %s\n%s" % (url, err))
        sys.exit(6)
    except ET.ParseError:
        # An empty or malformed reply is treated as "no captions".
        return None


def get_track_list(video_id):
    """Return the list of available captions for a given youtube video_id.

    The result is a dict mapping a language code to a Track. It is empty
    when the reply contains no tracks or cannot be parsed as XML. Exits with
    status 6 on a network error.
    """
    url = LIST_URL % urlencode({'type': 'list', 'v': video_id})
    try:
        with closing(urlopen(url)) as response:
            tree = ET.parse(response)
    except (URLError, HTTPError, HTTPException) as err:
        print("Network error: Unable to retrieve %s\n%s" % (url, err))
        sys.exit(6)
    except ET.ParseError:
        return {}

    captions = {}
    for element in tree.iter('track'):
        fields = [element.get(key) for key in Track._fields]
        captions[element.get('lang_code')] = Track(*fields)
    return captions


def parse_track(track):
    """Parse a track returned by youtube and return a list of lines.

    `track` is anything ElementTree.parse() accepts: a file name or an open
    file object. Elements without text are skipped.
    """
    lines = []
    for element in ET.parse(track).iter('text'):
        text = element.text
        if not text:
            continue
        start = float(element.get('start'))
        # duration is sometimes unspecified
        duration = float(element.get('dur') or 0)
        lines.append(Line(start, duration, text))
    return lines


def convert_caption(caption):
    """Convert each line in a caption to srt format and return a list.

    Returns None for an empty or missing caption. When a line has no
    duration, it ends where the next line starts; the last line falls back
    to a fixed duration.
    """
    if not caption:
        return None

    total = len(caption)
    lines = []
    for num, line in enumerate(caption, 1):
        start = line.start
        if line.duration:
            end = start + line.duration  # duration of the line is specified
        elif num < total:
            # num is 1-based, so caption[num] is the following line
            end = caption[num].start
        else:
            end = start + _FALLBACK_DURATION  # last resort
        entry = u'%(num)i\r\n%(start)s --> %(end)s\r\n%(text)s\r\n\r\n' % \
            {'num': num,
             'start': convert_time(start),
             'end': convert_time(end),
             'text': line.text}
        for entity, char in _ENTITIES:
            entry = entry.replace(entity, char)
        lines.append(entry)

    return lines


def convert_time(time):
    """Convert given time (in seconds) to srt format, HH:MM:SS,mmm.

    The value is rounded to whole milliseconds first so that float noise
    (1.001 stored as 1.000999...) does not drop a millisecond.
    """
    seconds, milliseconds = divmod(int(round(time * 1000)), 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, seconds, milliseconds)


def _video_id_from_url(url):
    """Return the video id of a YouTube url, or None if there is none."""
    parts = urlparse(url)
    video_id = dict(parse_qsl(parts.query)).get('v')
    if not video_id and parts.hostname in ('youtu.be', 'www.youtu.be'):
        # short links carry the id as the first path segment
        video_id = parts.path.strip('/').split('/')[0]
    return video_id or None


def _srt_name(name):
    """Return `name` with a .srt extension appended unless already there."""
    return name if name.endswith('.srt') else name + '.srt'


def main():
    """Parse the command line and write the requested captions to a file."""
    parser = argparse.ArgumentParser(
        description="Download closed captions of a YouTube video as a SRT "
                    "file.")
    parser.add_argument('uri',
                        metavar='VIDEO_URL_OR_FILENAME',
                        help="YouTube video url or filename")
    parser.add_argument('-l', '--lang',
                        metavar='l1 [,l2...]', default='en',
                        help="comma separated list of two letter language "
                             "codes (default: en)")
    parser.add_argument('-o', '--output',
                        help="write captions to FILE instead of video_id.srt")
    args = parser.parse_args()

    if args.uri.startswith('http'):
        video_id = _video_id_from_url(args.uri)
        if not video_id:
            parser.error("Unable to find a video id in url: %s" % args.uri)

        output = _srt_name(args.output or video_id)
        # tolerate "en, fr" as well as "en,fr"
        languages = [code.strip() for code in args.lang.split(',')
                     if code.strip()]

        caption = retrieve_caption(video_id, languages)
        if caption:
            save_srt(caption, output)
            return

        captions = get_track_list(video_id)
        if captions:
            print("Available languages:")
            for code in captions:
                print("  %(code)s\t%(original)s (%(translated)s)" %
                      {'code': code,
                       'original': captions[code].lang_original,
                       'translated': captions[code].lang_translated})
        else:
            print("There are no subtitles available for this video: %s"
                  % args.uri)
    elif isfile(args.uri):
        output = _srt_name(args.output or splitext(basename(args.uri))[0])

        try:
            track = parse_track(args.uri)
        except ET.ParseError as err:
            print("Unable to parse %s: %s" % (args.uri, err))
            return

        caption = convert_caption(track)
        if caption:
            save_srt(caption, output)
        else:
            print("There are no captions in this file: %s" % args.uri)
    else:
        print("There is no such file: %s" % args.uri)


if __name__ == '__main__':
    main()