├── .gitignore ├── README.md └── getyoutubecc.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | NOTE: NOT WORKING!!!!!! 2 | ====================================================== 3 | YouTube changed the API, so this script is outdated and not working... Not planning on fixing it anytime soon. Sorry 4 | 5 | As an alternative use `youtube-dl` with the `--write-auto-sub` option (thanks to coderholic) 6 | 7 | DESCRIPTION 8 | ======================================================= 9 | 10 | This class allows you to download the captions of a video from YouTube 11 | It supports: 12 | - Downloading a track if video has multiple tracks (you have to know the name of the track) 13 | - Automatic translation to another language 14 | 15 | 16 | PYTHON CLASS USAGE 17 | ------------------------------------------------------- 18 | 19 | Example: 20 | >>> import getyoutubecc 21 | #import the library 22 | >>> cc = getyoutubecc.getyoutubecc('2XraaWefBd8','en') 23 | # Now in cc.caption_obj are the parsed captions, its syntax is like: 24 | # [{'textlines': [u"caption first line", 'caption second line'], 25 | # 'time': {'hours':1, 'min':2,'sec':44,'msec':232} }] 26 | # Modify the caption as you want if desired 27 | >>> cc.writeSrtFile('captionsfile.srt') 28 | #write the contents to a srt file 29 | 30 | Notes: 31 | MULTITRACK VIDEO 32 | if video is a multitrack video (or the track has a name) you need 33 | to specify the name of the track: 34 | >>> cc = getyoutubecc.getyoutubecc('pNiFoYt69-w','fr','french') 35 | TRANSLATE VIDEO 36 | if you prefer the automatic translation to another language use 37 | the lang code 38 | >>> cc = getyoutubecc.getyoutubecc('pNiFoYt69-w','fr','french', tlang='es') 39 | 40 | 41 | COMMAND LINE USAGE 42 | 
#!/usr/bin/python
"""Download YouTube closed captions and save them as SubRip (.srt) files.

NOTE: the project README states that YouTube changed its API, so the
timedtext endpoint used here is outdated and the download no longer works.
It recommends ``youtube-dl --write-auto-sub`` as an alternative.

Command line usage
------------------

If you prefer the command line version of this, or just to test it::

    $ ./getyoutubecc.py -h
    getyoutubecc -v <videoid> -l <lang> [-t <track>] [-T <translate>]
    Example: getyoutubecc -v pNiFoYt69-w -l fr -t french -T es
    Example: getyoutubecc -v 2XraaWefBd8 -l en
    NOTE: if video has a track name, the -t argument is mandatory

Options::

    -v --videoid    Video id, like 2XraaWefBd8. It appears in every
                    YouTube URL.
    -l --language   ISO language code, like en, es, fr.
    -t --track      The name of the track if the video has several. You
                    will have to find out this name in the YouTube page of
                    the video.
    -T --translate  ISO code (en, es, fr, ...) of the language you want to
                    translate to.

Copyright
---------

This code is released into the public domain by the copyright holders.
"""

# TODO:
# - Test the code, e.g. different languages in different videos.
# - Improve the regular expressions.
# - Add support for YouTube's automatically generated captions.

import codecs  # noqa: F401 -- unused, kept from the original import list
import getopt
import re
import sys
import xml.etree.ElementTree as ET

try:  # Python 3
    from html import unescape as _unescape
    from urllib.parse import urlencode as _urlencode
    from urllib.request import urlopen as _urlopen
except ImportError:  # Python 2
    from HTMLParser import HTMLParser as _HTMLParser
    from urllib import urlencode as _urlencode
    from urllib import urlopen as _urlopen
    _unescape = _HTMLParser().unescape

# Whole seconds and optional fractional digits of a ``start`` attribute,
# e.g. "2997.929" -> ("2997", "929").
_START_RE = re.compile(r'(\d+)(?:\.(\d+))?')

_USAGE = 'getyoutubecc -v <videoid> -l <lang> [-t <track>] [-T <translate>]'
_USAGE_EXAMPLES = (
    'Example: getyoutubecc -v pNiFoYt69-w -l fr -t french -T es',
    'Example: getyoutubecc -v 2XraaWefBd8 -l en ',
)
_USAGE_NOTE = 'NOTE: if video has a track name, the -t argument is mandatory '


def _time_to_msec(time):
    """Convert a caption ``time`` dict to a total number of milliseconds."""
    # int() tolerates callers that stored the fields as numeric strings.
    seconds = (int(time['hours']) * 60 + int(time['min'])) * 60 + int(time['sec'])
    return seconds * 1000 + int(time['msec'])


def _format_srt_time(total_msec):
    """Format milliseconds as a SubRip timestamp ``HH:MM:SS,mmm``."""
    seconds, msec = divmod(total_msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return "%02d:%02d:%02d,%03d" % (hours, minutes, seconds, msec)


class getyoutubecc(object):
    """Download the captions of a YouTube video and export them as SRT.

    Example:
        >>> import getyoutubecc
            #import the library
        >>> cc = getyoutubecc.getyoutubecc('2XraaWefBd8','en')
            # Now in cc.caption_obj are the parsed captions, its syntax is like:
            # [{'textlines': [u"caption first line", 'caption second line'],
            #   'time': {'hours':1, 'min':2,'sec':44,'msec':232} }]
            # Modify the caption as you want if desired
        >>> cc.writeSrtFile('captionsfile.srt')
            #write the contents to a srt file
    Note:
        MULTITRACK VIDEO
        if video is a multitrack video (or the track has a name) you need
        to specify the name of the track:
        >>> cc = getyoutubecc.getyoutubecc('pNiFoYt69-w','fr','french')
        TRANSLATE VIDEO
        if you prefer the automatic translation to another language use
        the lang code
        >>> cc = getyoutubecc.getyoutubecc('pNiFoYt69-w','fr','french', tlang='es')
    """

    # Parsed captions. Class-level fallback only; __init__ binds a fresh
    # list on every instance so captions are never shared between objects.
    caption_obj = []

    def __init__(self, video_id, lang="en", track="", tlang=""):
        """Fetch and parse the captions of ``video_id``.

        Args:
            video_id: YouTube video id, e.g. ``'2XraaWefBd8'``.
            lang: language code of the caption track.
            track: name of the caption track ("" when it has no name).
            tlang: language code to translate to ("" for no translation).

        On a connection problem a message is printed and ``caption_obj`` is
        left as an empty list.
        """
        self.caption_obj = []
        # urlencode escapes spaces and other reserved characters in values.
        query = _urlencode([("v", video_id), ("lang", lang),
                            ("name", track), ("tlang", tlang)])
        cc_url = "http://youtube.com/api/timedtext?" + query
        print("video id: " + video_id)
        print("video language: " + lang)
        print("video track: " + track)
        print("translate video to: " + tlang)
        try:
            response = _urlopen(cc_url)
            try:
                cc = response.read()
            finally:
                response.close()
        except IOError:
            print("Problem with connection")
            return
        # Parse the response into an easy-to-modify list of captions.
        self.caption_obj = self._parseXml(cc)
        if not self.caption_obj:
            print("url " + cc_url + " was an empty response. Multitrack video?")

    def writeSrtFile(self, filename="caption"):
        """Write ``self.caption_obj`` to ``filename`` as UTF-8 encoded SRT."""
        srt_lines = self._generateSrt(self.caption_obj)
        # Binary mode + explicit encoding behaves the same on Python 2 and 3.
        with open(filename, 'wb') as srtfile:
            for line in srt_lines:
                srtfile.write(line.encode('utf8') + b"\n")

    def _parseXml(self, cc):
        """Parse a timedtext XML document into a list of captions.

        INPUT: XML document (bytes or text) made of elements like
            <text start="2997.929">So, it will\\nhas time</text>
        OUTPUT: parsed object like:
            [{'textlines': [u"So, I'm going to rewrite this", 'in a more concise form as'],
              'time': {'hours':1, 'min':2,'sec':44,'msec':232} }]
        Empty or malformed input yields an empty list.
        """
        if not cc:
            return []
        # NOTE(review): the XML comes from the network; the stdlib parser is
        # not hardened against maliciously constructed documents.
        try:
            root = ET.fromstring(cc)
        except ET.ParseError:
            return []
        captions = []
        for node in root.iter('text'):
            match = _START_RE.match(node.get('start', ''))
            if match is None:
                continue  # no usable start time, nothing to schedule
            whole, fraction = match.groups()
            # Right-pad the fraction so ".5" means 500 ms, not 5 ms.
            msec = int((fraction + "00")[:3]) if fraction else 0
            minutes, sec = divmod(int(whole), 60)
            hours, minutes = divmod(minutes, 60)
            raw_text = "".join(node.itertext())
            # The XML parser undid one level of escaping; the text was
            # unescaped twice originally (e.g. "&amp;#39;" -> "'").
            textlines = [_unescape(part) for part in raw_text.split('\n')]
            ntime = {'hours': hours, 'min': minutes, 'sec': sec, 'msec': msec}
            captions.append({'time': ntime, 'textlines': textlines})
        return captions

    def _generateSrt(self, captions):
        """Turn parsed captions into the lines of an SRT file.

        INPUT: array with captions, i.e.
            [{'textlines': [u"So, I'm going to rewrite this", 'in a more concise form as'],
              'time': {'hours':1, 'min':2,'sec':44,'msec':232} }]
        OUTPUT: list of lines (without line terminators) in SRT format
        """
        srt_output = []
        last_index = len(captions) - 1
        for index, caption in enumerate(captions):
            start = _time_to_msec(caption['time'])
            if index < last_index:
                # Display the caption until the next one starts.
                end = _time_to_msec(captions[index + 1]['time'])
            else:
                # Display the last caption for 2 seconds.
                end = start + 2000
            srt_output.append(str(index + 1))
            srt_output.append(_format_srt_time(start) + " --> " + _format_srt_time(end))
            srt_output.extend(caption['textlines'])
            # Two empty lines separate every caption shown.
            srt_output.append("")
            srt_output.append("")
        return srt_output


def _print_usage(with_note=True):
    """Print the command line help, optionally with the track-name note."""
    print(_USAGE)
    for example in _USAGE_EXAMPLES:
        print(example)
    if with_note:
        print(_USAGE_NOTE)


def main(argv=None):
    """Command line entry point: download captions into ``<videoid>.srt``.

    ``argv`` defaults to ``sys.argv[1:]``. The language defaults to ``en``
    (the class default) when ``-l`` is not given.
    """
    if argv is None:
        argv = sys.argv[1:]

    videoid = ''
    lang = 'en'
    track = ''
    tlang = ''

    try:
        opts, _args = getopt.getopt(
            argv, "hv:l:t:T:",
            ["help", "videoid=", "language=", "track=", "translate="])
    except getopt.GetoptError:
        _print_usage(with_note=False)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            _print_usage()
            sys.exit()
        elif opt in ("-v", "--videoid"):
            videoid = arg
        elif opt in ("-l", "--language"):
            lang = arg
        elif opt in ("-t", "--track"):
            track = arg
        elif opt in ("-T", "--translate"):
            tlang = arg

    if videoid == '':
        _print_usage()
        return
    print("downloading " + videoid + " captions")
    cc = getyoutubecc(videoid, lang, track, tlang)
    cc.writeSrtFile(videoid + '.srt')


if __name__ == "__main__":
    main()