├── .gitignore
├── README.md
└── getyoutubecc.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | NOTE: NOT WORKING!!!!!!
 2 | ======================================================
 3 | YouTube changed its API, so this script is outdated and no longer works. There are no plans to fix it anytime soon. Sorry.
 4 | 
 5 | As an alternative use `youtube-dl` with the `--write-auto-sub` option (thanks to coderholic)
 6 | 
 7 | DESCRIPTION
 8 | =======================================================
 9 | 
10 | This class allows you to download the captions of a YouTube video.
11 | It supports:
12 |     - Downloading a track if video has multiple tracks (you have to know the name of the track)
13 |     - Automatic translation to another language
14 | 
15 | 
16 | PYTHON CLASS USAGE
17 | -------------------------------------------------------
18 | 
19 |     Example:
20 |             >>> import getyoutubecc
21 |             #import the library
22 |             >>> cc = getyoutubecc.getyoutubecc('2XraaWefBd8','en')
23 |             # Now in cc.caption_obj are the parsed captions, its syntax is like:
24 |             # [{'textlines': [u"caption first line", 'caption second line'],
25 |             #    'time': {'hours':1, 'min':2, 'sec':44, 'msec':232} }]
26 |             # Modify the caption as you want if desired
27 |             >>> cc.writeSrtFile('captionsfile.srt')
28 |             #write the contents to a srt file
29 |          
30 |          Notes:
31 |            MULTITRACK VIDEO
32 |            if video is a multitrack video (or the track has a name) you need
33 |            to specify the name of the track:
34 |             >>> cc = getyoutubecc.getyoutubecc('pNiFoYt69-w','fr','french')
35 |            TRANSLATE VIDEO
36 |            if you prefer the automatic translation to another language use 
37 |            the lang code
38 |             >>> cc = getyoutubecc.getyoutubecc('pNiFoYt69-w','fr','french', tlang='es')
39 | 
40 | 
41 | COMMAND LINE USAGE
42 | -------------------------------------------------------
43 | 
44 | If you prefer the command line version of this, or just to test it:
45 | 
46 |             $ ./getyoutubecc.py -h
47 |             getyoutubecc -v <video_id> -l <language_id> [-t <track_name>] [-T <translate_to>]
48 |             Example: getyoutubecc -v pNiFoYt69-w -l fr -t french -T es
49 |             Example: getyoutubecc -v 2XraaWefBd8 -l en 
50 |             NOTE: if video has a track name, the -t argument is mandatory 
51 | 
52 | OPTIONS:
53 | 
54 |     -v --videoid        Video id. Like 2XraaWefBd8. It appears in every youtube URL
55 |     -l --language       iso lang code. Like en, es, fr..
56 |     -t --track          The name of the track if video has several. You will have to find out
57 |                         this name in the youtube page of the video
58 |     -T --translate      ISO lang code (like en, es, fr) of the language you want to translate to
59 | 
60 | 
61 | COPYRIGHT
62 | -------------------------------------------------------
63 | 
64 | this code is released into the public domain by the copyright holders.
65 | 
66 | TODO
67 | -------------------------------------------------------
68 | 
69 | - Test the code, for instance with different languages in different videos
70 | - Improve the regular expression
71 | - Add support for YouTube's automatically generated captions
72 | 


--------------------------------------------------------------------------------
/getyoutubecc.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import urllib, HTMLParser, re, codecs
  4 | 
  5 | class getyoutubecc():
  6 |     """ This class allows you to download the caption from a video from you tube
  7 |         Example:
  8 |             >>> import getyoutubecc
  9 |             #import the library
 10 |             >>> cc = getyoutubecc.getyoutubecc('2XraaWefBd8','en')
 11 |             # Now in cc.caption_obj are the parsed captions, its syntax is like:
 12 |             # [{'texlines': [u"caption first line", 'caption second line'],
 13 |             #    'time': {'hours':'1', 'min':'2','sec':44,'msec':232} }]
 14 |             # Modify the caption as you want if desired
 15 |             >>> cc.writeSrtFile('captionsfile.srt')
 16 |             #write the contents to a srt file
 17 |          Note:
 18 |            MULTITRACK VIDEO
 19 |            if video is a multitrack video (or the track has a name) you need
 20 |            to specify the name of the track:
 21 |             >>> cc = getyoutubecc.getyoutubecc('pNiFoYt69-w','fr','french')
 22 |            TRANSLATE VIDEO
 23 |            if you prefer the automatic translation to another language use 
 24 |            the lang code
 25 |             >>> cc = getyoutubecc.getyoutubecc('pNiFoYt69-w','fr','french', tlang:'es')
 26 |     """
 27 |     
 28 |     caption_obj = {}
 29 | 
 30 |     """ This object contains the fetched captions. Use this to treat the captions or whatever"""
 31 |     def __init__(self, video_id, lang="en", track="", tlang="" ):
 32 |         """ """
 33 |         #Obtain the file from internet
 34 |         cc_url = "http://youtube.com/api/timedtext?v=" + video_id + "&lang=" + lang + "&name=" + track + "&tlang=" + tlang
 35 |         print "video id: " + video_id
 36 |         print "video language: " + lang
 37 |         print "video track: " + track
 38 |         print "translate video to: " + tlang
 39 |         try:
 40 |             cc = urllib.urlopen(cc_url).read() 
 41 |         except:
 42 |             print "Problem with connection"
 43 |         #parse the file to make a easy to modify object with the captions and its time
 44 |         if self.caption_obj == []:
 45 |           print "url " + cc_url + " was an empty response. Multitrack video?"
 46 |         self.caption_obj = self._parseXml(cc);
 47 | 
 48 |     def writeSrtFile(self,filename="caption"):
 49 |         srt_lines = self._generateSrt(self.caption_obj) #generate the srt file
 50 |         srtfile = open(filename,'w')
 51 |         for line in srt_lines:
 52 |             srtfile.write( line.encode('utf8') + "\n")
 53 | 
 54 |     def _parseXml(self,cc):
 55 |         """ INPUT: XML file with captions
 56 |             OUTPUT: parsed object like:
 57 |                 [{'texlines': [u"So, I'm going to rewrite this", 'in a more concise form as'],
 58 |                 'time': {'hours':'1', 'min':'2','sec':44,'msec':232} }]
 59 |         """
 60 |         htmlpar = HTMLParser.HTMLParser()
 61 |         cc = cc.split("</text>") # ['<text start="2997.929">So, it will\nhas time', '<text start="3000.929">blah', ..]
 62 |         captions = []
 63 |         for line in cc:
 64 |             if re.search('text', line):
 65 |                 time = re.search(r'start="(\d+)(?:\.(\d+)){0,1}', line).groups() # ('2997','929')
 66 |                 time = ( int(time[0]), int(0 if not time[1] else time[1]) )
 67 |                     #convert seconds and millisec to int
 68 |                 text = re.search(r'">(.*)', line, re.DOTALL).group(1) # extract text i.e. 'So, it will\nhas time'
 69 |                 textlines = [ htmlpar.unescape(htmlpar.unescape( unicode(lineunparsed,"utf-8") )) for lineunparsed in text.split('\n') ] 
 70 |                     #unscape chars like &amp; or &#39;
 71 |                 ntime = {'hours':time[0]/3600,"min":time[0]%3600/60,"sec":time[0]%3600%60,"msec":time[1]}
 72 |                 captions.append({'time':ntime,'textlines':textlines})
 73 |         return captions
 74 | 
 75 |     def _generateSrt(self,captions):
 76 |         """ INPUT: array with captions, i.e.
 77 |                 [{'texlines': [u"So, I'm going to rewrite this", 'in a more concise form as'],
 78 |                 'time': {'hours':'1', 'min':'2','sec':44,'msec':232} }]
 79 |             OUTPUT: srtformated string
 80 |         """
 81 |         caption_number = 0
 82 |         srt_output = []
 83 |         for caption in captions:
 84 |             caption_number += 1
 85 |             #CAPTION NUMBER
 86 |             srt_output.append(str(caption_number))
 87 |             #TIME
 88 |             time_from = ( caption['time']['hours'], caption['time']['min'], caption['time']['sec'], caption['time']['msec'] ) 
 89 |             if len(captions)>caption_number:
 90 |                 #display caption until next one
 91 |                 next_caption_time = captions[caption_number]['time']
 92 |                 time_to = ( next_caption_time['hours'], next_caption_time['min'], next_caption_time['sec'], next_caption_time['msec'] )
 93 |             else:
 94 |                 #display caption for 2 seconds
 95 |                 time_to = (time_from[0],time_from[1]+2,time_from[2],time_from[3]) 
 96 |             srt_output.append( (":").join([str(i) for i in time_from[0:-1]])+","+str(time_from[-1])+" --> "+(":").join([str(i) for i in time_to[0:-1]])+","+str(time_to[-1]))
 97 |             #CAPTIONS
 98 |             for caption_line in caption['textlines']:
 99 |                 srt_output.append(caption_line)
100 |             #Add two empty lines to serarate every caption showed
101 |             srt_output.append("")
102 |             srt_output.append("")
103 |         return srt_output
104 | 
105 |         
if __name__ == "__main__":
    # Command line front end: download the captions of one video and write
    # them to "<video_id>.srt" in the current directory.
    import getopt
    import sys

    # Single copy of the usage text, shared by -h, invalid options and the
    # missing video id case.
    usage_lines = (
        'getyoutubecc -v <video_id> -l <language_id> -t <track_name> -T <translate_to>',
        'Example: getyoutubecc -v pNiFoYt69-w -l fr -t french -T es',
        'Example: getyoutubecc -v 2XraaWefBd8 -l en ',
        'NOTE: if video has a track name, the -t argument is mandatory ',
    )

    def print_usage():
        """Print the command line help."""
        for usage_line in usage_lines:
            print(usage_line)

    videoid = ''
    lang = 'en'  # same default as the getyoutubecc constructor
    track = ''
    tlang = ''

    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "hv:l:t:T:",
            ["videoid=", "language=", "track=", "translate="])
    except getopt.GetoptError:
        print_usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print_usage()
            sys.exit()
        elif opt in ("-v", "--videoid"):
            videoid = arg
        elif opt in ("-l", "--language"):
            lang = arg
        elif opt in ("-t", "--track"):
            track = arg
        elif opt in ("-T", "--translate"):
            tlang = arg

    if videoid == '':
        # The video id is the only mandatory argument; report the usage
        # error through the exit status as well.
        print_usage()
        sys.exit(2)

    print("downloading " + videoid + " captions")
    cc = getyoutubecc(videoid, lang, track, tlang)
    cc.writeSrtFile(videoid + '.srt')
146 | 


--------------------------------------------------------------------------------