├── .gitignore
├── README.md
└── youtube2srt.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | youtube2srt
 2 | ===========
 3 | 
 4 | A small command-line utility I had lying around for a good while that allows you to download closed captions from YouTube as an SRT file.
 5 | 
 6 | ```
 7 | usage: youtube2srt.py [-h] [-l l1 [,l2...]] [-o OUTPUT] VIDEO_URL_OR_FILENAME
 8 | 
 9 | Download closed captions of a YouTube video as a SRT file.
10 | 
11 | positional arguments:
12 |   VIDEO_URL_OR_FILENAME
13 |                         YouTube video url or filename
14 | 
15 | optional arguments:
16 |   -h, --help            show this help message and exit
17 |   -l l1 [,l2...], --lang l1 [,l2...]
18 |                         comma separated list of two letter language codes
19 |                         (default: en)
20 |   -o OUTPUT, --output OUTPUT
21 |                         write captions to FILE instead of video_id.srt
22 | ```
23 | 


--------------------------------------------------------------------------------
/youtube2srt.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Copyright (c) 2012, Ozan Sener
  4 | # Contributor (c) 2016, Paulo Miguel Almeida Rodenas
  5 | # All rights reserved.
  6 | 
  7 | # Redistribution and use in source and binary forms, with or without
  8 | # modification, are permitted provided that the following conditions are met:
  9 | #
 10 | #   * Redistributions of source code must retain the above copyright notice,
 11 | #     this list of conditions and the following disclaimer.
 12 | #   * Redistributions in binary form must reproduce the above copyright notice,
 13 | #     this list of conditions and the following disclaimer in the documentation
 14 | #     and/or other materials provided with the distribution.
 15 | #   * Neither the name of copyright holders nor the names of its contributors
 16 | #     may be used to endorse or promote products derived from this software
 17 | #     without specific prior written permission.
 18 | #
 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 22 | # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 23 | # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 24 | # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 25 | # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 26 | # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 27 | # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 28 | # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 29 | # POSSIBILITY OF SUCH DAMAGE.
 30 | 
 31 | try:
 32 |     from urllib.parse import urlparse, urlencode, parse_qsl
 33 |     from urllib.request import urlopen
 34 |     from urllib.error import HTTPError, URLError
 35 |     from http.client import HTTPException
 36 | except ImportError:
 37 |     from urlparse import urlparse, parse_qsl
 38 |     from urllib import urlencode
 39 |     from urllib2 import urlopen, HTTPError, URLError
 40 |     from httplib import HTTPException
 41 | 
 42 | from os.path import basename, splitext, isfile
 43 | import codecs
 44 | import argparse
 45 | import sys
 46 | import xml.etree.ElementTree as ET
 47 | from collections import namedtuple
 48 | 
# Endpoint queried by get_track() for a single caption track; the query string
# carries the track name, the language code and the video id.
TRACK_URL = 'http://video.google.com/timedtext?%s'
# Endpoint queried by get_track_list() (with type=list) for a video's tracks.
LIST_URL = 'http://www.youtube.com/api/timedtext?%s'
# Attribute names read from every <track> element; also the Track field names.
TRACK_KEYS = 'id name lang_original lang_translated lang_default'

# One entry of the track listing; get_track_list() keys these by language code.
Track = namedtuple('Track', TRACK_KEYS)
# One caption cue: start and duration in seconds, plus the caption text.
Line = namedtuple('Line', 'start duration text')
 55 | 
 56 | 
 57 | def save_srt(caption, filename):
 58 |     """Save a list of srt formatted lines in a srt file with UTF-8 BOM"""
 59 |     with codecs.open(filename, 'w', 'utf-8-sig') as srt_file:
 60 |         srt_file.writelines(caption)
 61 | 
 62 | 
 63 | def retrieve_caption(video_id, languages):
 64 |     """
 65 |     Fetch the first available track in a language list, convert it to srt and
 66 |     return the list of lines for a given youtube video_id.
 67 |     """
 68 |     track = get_track(video_id, languages)
 69 |     caption = convert_caption(track)
 70 | 
 71 |     return caption
 72 | 
 73 | 
 74 | def get_track(video_id, languages):
 75 |     """Return the first available track in a language list for a video."""
 76 |     tracks = get_track_list(video_id)
 77 |     for lang in languages:
 78 |         if lang in tracks:
 79 |             break
 80 |     else:
 81 |         return
 82 | 
 83 |     track = tracks[lang]
 84 |     url = TRACK_URL % urlencode({'name': track.name, 'lang': lang,
 85 |                                         'v': video_id})
 86 |     track = urlopen(url)
 87 | 
 88 |     return parse_track(track)
 89 | 
 90 | 
 91 | def get_track_list(video_id):
 92 |     """Return the list of available captions for a given youtube video_id."""
 93 |     url = LIST_URL % urlencode({'type': 'list', 'v': video_id})
 94 |     captions = {}
 95 |     try:
 96 |         data = urlopen(url)
 97 |         tree = ET.parse(data)
 98 |         for element in tree.iter('track'):
 99 |             lang = element.get('lang_code')
100 |             fields = map(element.get, TRACK_KEYS.split())
101 |             captions[lang] = Track(*fields)
102 |     except (URLError, HTTPError, HTTPException) as err:
103 |         print("Network error: Unable to retrieve %s\n%s" % (url, err))
104 |         sys.exit(6)
105 |     return captions
106 | 
107 | 
def parse_track(track):
    """Read a youtube track (file name or file object) into a list of Line.

    Every non-empty <text> element yields one Line holding its start time,
    its duration and its text, in document order.
    """
    document = ET.parse(track)
    return [
        # a missing 'dur' attribute is stored as a duration of 0
        Line(float(node.get('start')), float(node.get('dur') or 0), node.text)
        for node in document.iter('text')
        if node.text
    ]
123 | 
124 | 
def convert_caption(caption):
    """Convert each line in a caption to srt format and return a list.

    caption is a sequence of objects with start, duration and text
    attributes (times in seconds). Each entry becomes one srt block made of
    a 1-based counter, a "start --> end" timestamp row and the text, with
    CRLF line endings. None is returned when caption is empty or None.

    The end time is start + duration when a duration is given; otherwise it
    is the start of the following line, or start + 5 seconds for the final
    line.
    """
    if not caption:
        return None

    total = len(caption)
    lines = []
    for num, line in enumerate(caption, 1):
        start = line.start
        if line.duration:
            end = start + line.duration  # duration of the line is specified
        elif num < total:
            # num counts from 1, so caption[num] is the *next* line. The
            # bounds check keeps the final line from raising IndexError.
            end = caption[num].start
        else:
            end = start + 5  # last resort: nothing follows the final line
        # Undo the entity escaping left in the text by the track source.
        text = line.text.replace('&quot;', '"')\
                        .replace('&amp;', '&')\
                        .replace('&#39;', '\'')
        lines.append(u'%(num)i\r\n%(start)s --> %(end)s\r\n%(text)s\r\n\r\n' %
                     {'num': num,
                      'start': convert_time(start),
                      'end': convert_time(end),
                      'text': text})

    return lines
150 | 
151 | 
def convert_time(time):
    """Convert a time given in seconds to the srt format HH:MM:SS,mmm.

    The value is first rounded to the nearest whole millisecond and then
    split with integer arithmetic. Formatting the float parts directly
    truncated them, so 2.3 seconds came out as 00:00:02,299.
    """
    total_milliseconds = int(round(time * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, seconds, milliseconds)
160 | 
161 | 
def main():
    """Command-line entry point.

    Parse the arguments, then either download the captions of a YouTube
    video (when the positional argument starts with 'http') or convert a
    local track file, and save the result as a srt file. When no track
    matches the requested languages, the available languages are listed
    instead. A url without a 'v' query parameter is rejected with a usage
    error.
    """
    parser = argparse.ArgumentParser(description="Download closed captions of \
                                     a YouTube video as a SRT file.")
    parser.add_argument('uri',
                        metavar='VIDEO_URL_OR_FILENAME',
                        help="YouTube video url or filename")
    parser.add_argument('-l', '--lang',
                        metavar='l1 [,l2...]', default='en',
                        help="comma separated list of two letter language \
                        codes (default: en)")
    parser.add_argument('-o', '--output',
                        help="write captions to FILE instead of video_id.srt")
    args = parser.parse_args()

    if args.uri.startswith('http'):
        queries = dict(parse_qsl(urlparse(args.uri).query))

        video_id = queries.get('v')
        if not video_id:
            # Without this check the missing id surfaced as an AttributeError
            # while building the output file name.
            parser.error("no video id ('v' parameter) found in url: %s"
                         % args.uri)
        output = args.output or video_id
        output = output if output.endswith('.srt') else output + '.srt'
        # Tolerate spaces around the commas, e.g. "en, fr".
        languages = [code.strip() for code in args.lang.split(',')
                     if code.strip()]

        caption = retrieve_caption(video_id, languages)

        if caption:
            save_srt(caption, output)
            return

        # Nothing matched: show what the video offers instead.
        captions = get_track_list(video_id)
        if captions:
            print("Available languages:")
            for code in captions:
                print("  %(code)s\t%(original)s (%(translated)s)" %
                      {'code': code,
                       'original': captions[code].lang_original,
                       'translated': captions[code].lang_translated})
        else:
            print("There are no subtitles available for this video: %s" % args.uri)
    else:
        if isfile(args.uri):
            output = args.output or splitext(basename(args.uri))[0]
            output = output if output.endswith('.srt') else output + '.srt'

            track = parse_track(args.uri)
            caption = convert_caption(track)

            if caption:
                save_srt(caption, output)
                return
        else:
            print("There is no such file: %s" % args.uri)
213 | 
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == '__main__':
    main()
216 | 


--------------------------------------------------------------------------------