├── .gitignore
├── chrome
│   ├── source.crx
│   ├── source
│   │   ├── ikona128.png
│   │   ├── manifest.json
│   │   ├── background.html
│   │   └── js
│   │       └── inject.js
│   └── source.pem
├── youtube_dl
│   ├── __main__.py
│   ├── PostProcessor.py
│   ├── utils.py
│   ├── __init__.py
│   ├── FileDownloader.py
│   └── InfoExtractors.py
├── install.sh
├── readme.md
├── index.html
└── youtube-dl-server

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 

--------------------------------------------------------------------------------
/chrome/source.crx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dz0ny/youtube-dl-server/HEAD/chrome/source.crx

--------------------------------------------------------------------------------
/chrome/source/ikona128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dz0ny/youtube-dl-server/HEAD/chrome/source/ikona128.png

--------------------------------------------------------------------------------
/youtube_dl/__main__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import __init__
5 | 
6 | if __name__ == '__main__':
7 |     __init__.main()
8 | 

--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | GITHUB=https://github.com/dz0ny/youtube-dl-server
3 | 
4 | rm -rf ~/youtube-dl-server
5 | mkdir ~/youtube-dl-server
6 | cd ~/youtube-dl-server
7 | curl -L $GITHUB/tarball/master -o ./youtube-dl-server.tar.gz
8 | tar -zxvf ./youtube-dl-server.tar.gz --strip 1
9 | rm ./youtube-dl-server.tar.gz
10 | rm ./install.sh
11 | chmod a+x ./youtube-dl-server

--------------------------------------------------------------------------------
/chrome/source/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "YoutubeDL",
3 |   "version": "0.0.4",
4 |   "description": "Downloads YouTube videos as MP3",
5 |   "icons": {"128": "ikona128.png" },
6 |   "page_action": {
7 |     "default_icon": "ikona128.png", // optional
8 |     "default_title": "Download MP3"
9 |   },
10 |   "background_page": "background.html",
11 |   "permissions": [
12 |     "",
13 |     "unlimitedStorage",
14 |     "tabs"
15 |   ]
16 | }
17 | 

--------------------------------------------------------------------------------
/chrome/source/background.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
28 | 
29 | 

--------------------------------------------------------------------------------
/chrome/source.pem:
--------------------------------------------------------------------------------
1 | -----BEGIN PRIVATE KEY-----
2 | MIICeAIBADANBgkqhkiG9w0BAQEFAASCAmIwggJeAgEAAoGBAOqQoSNZfRe4Ra9cb
3 | ZrlJjcVVdl3DRMsQO54JRnlBU61AAoFT+Q4ozKsuXmLxK8IuQbLpmlMCeAqZKEKSp
4 | 91eJGaiYBBTvZyZUaxCjtf2d/T70mc/9aENrT257t9B7FoIksX9HycIHB5oBw/t+3
5 | FqLFl1a7uLGChg66IPJgPGujRAgMBAAECgYEAn87NmfHcIg7vmxvTqNY6BQlKJhDQ
6 | HaHm0xGT5WJ9DTSPxEP+PDTCK0I2UzMAW2gL9y9EPzUI/WqkiHskgCNecjbo5YIMm
7 | DAOlN4KPiZxqb1K4zJWrlkWUkju94ZrHXUGELesChpqONvj8ImVT/2KyIGFFfdLpW
8 | fYYNPqojEaB4kCQQD57dDZVfi3twlTrCWWQ/W+Iip7RxbqvcCx2XeeFZFckCn0ck/
9 | BK+GFrA+kJNFpslSa0nKFvsdd5cNScXsXVT3DAkEA8ENFwW0DNOoADwqyPSC4kLk5
10 | PxbamBTFWeguQlloqilkRJA8KT0IprUPCpwx72ZwuIqs3L1hvna1mbJa+KRx2wJBA
11 | Ju5A8IHARtm3lbWEe1YlstK+nEpUCwe4uttdkx3X8TuxlVazDquHqxtEqnRjvFufa
12 | yhp12SCyKEQHkj3/Af2oUCQQDJeOAJrww9VuvtsR59u+6JDYk/qj5rwR8soVIJOUh
13 | XSJZYGEsamZ+ji7itQQVupwxm84K5J+XK/WiTFcjLL7p5AkBvYTCV/KMxruoNSC35
14 | heNWqYKQ8au+9amVG4UNVqTJEeCePbzNsFn4Q2QQA2Yxy2f/YIjP8sUmlfO9S8Uoh
15 | ovr
16 | -----END PRIVATE KEY-----
17 | 

--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # youtube-dl-server
2 | 
3 | Allows you to download mp3 audio or mp4 video from popular video sites. You can use bookmarklets or the bundled Chrome extension to initiate downloads.
4 | 
5 | ## Usage
6 | 
7 |     $ youtube-dl-server
8 | 
9 | ## Installation
10 | 
11 |     $ sudo apt-get install ffmpeg
12 |     $ curl -L https://github.com/dz0ny/youtube-dl-server/raw/master/install.sh | bash
13 | 
14 | ## License
15 | 
16 | (The MIT License)
17 | 
18 | Copyright (c) 2011 Janez Troha
19 | 
20 | Permission is hereby granted, free of charge, to any person obtaining
21 | a copy of this software and associated documentation files (the
22 | 'Software'), to deal in the Software without restriction, including
23 | without limitation the rights to use, copy, modify, merge, publish,
24 | distribute, sublicense, and/or sell copies of the Software, and to
25 | permit persons to whom the Software is furnished to do so, subject to
26 | the following conditions:
27 | 
28 | The above copyright notice and this permission notice shall be
29 | included in all copies or substantial portions of the Software.
30 | 
31 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
32 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
34 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
35 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
36 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
37 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
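Once the server is running (see Usage above), any HTTP client can drive the endpoints listed in index.html. A minimal client sketch in Python 2 (to match the codebase); the video URL is a placeholder, and it relies on the server's naive query parsing, which expects the raw, unencoded URL after the `=`:

    import urllib2

    video = 'http://www.youtube.com/watch?v=EXAMPLE'  # placeholder URL
    # The server matches the literal '?music320mp3=' prefix and passes the
    # rest of the query string straight to youtube-dl, so the URL is sent
    # unencoded.
    print urllib2.urlopen('http://localhost:9099/?music320mp3=' + video).read()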
--------------------------------------------------------------------------------
/chrome/source/js/inject.js:
--------------------------------------------------------------------------------
1 | 
2 | (function(window, undefined) {
3 |     $("textarea").attr("placeholder", "Enter translation...");
4 |     $("#q").replaceWith('');
5 |     $('tr:first').html($('.search').remove().clone());
6 |     $(document).bind('keydown', 'ctrl+x', function() {
7 |         $("form.edit_tolk_locale").submit();
8 |     });
9 |     $(document).bind('keydown', 'ctrl+right', function() {
10 |         chrome.extension.sendRequest({'action' : 'navigate', "url" : "https://teambox.com" + $(".next_page").attr("href")});
11 |     });
12 |     $(document).bind('keydown', 'ctrl+left', function() {
13 |         chrome.extension.sendRequest({'action' : 'navigate', "url" : "https://teambox.com" + $(".previous_page").attr("href")});
14 |     });
15 |     $('#head h1').append('Download translation');
16 |     $('.translations tr').append('');
17 |     $('nav:first').remove();
18 |     $(".button.google_translate").live("click", function(event) {
19 |         event.preventDefault();
20 |         var translation = $(this).parent().parent();
21 |         var phrase = translation.find(".phrase").contents()
22 |             .filter(function(){ return(this.nodeType == 3); })
23 |             .text()
24 |             .replace(/^\W/g, "")
25 |             .replace(/\s{2,}/g, "");
26 |         var aT = new Translate(phrase);
27 |         aT.search(function(result) {
28 |             translation.find("textarea").val(result);
29 |         });
30 |         return false;
31 |     });
32 |     $(".button.copy").live("click", function(event) {
33 |         event.preventDefault();
34 |         var translation = $(this).parent().parent();
35 |         var phrase = translation.find(".phrase").contents()
36 |             .filter(function(){ return(this.nodeType == 3); })
37 |             .text()
38 |             .replace(/^\W/g, "")
39 |             .replace(/\s{2,}/g, "");
40 |         translation.find("textarea").val(phrase);
41 |         return false;
42 |     });
43 |     $(".button.clean").live("click", function(event) {
44 |         event.preventDefault();
45 |         var translation = $(this).parent().parent();
46 |         translation.find("textarea").val("");
47 |         return false;
48 |     });
49 |     $(".button.download").live("click", function(event) {
50 |         event.preventDefault();
51 |         var url = "https://teambox.com" + $("form").attr("action") + ".yml";
52 |         chrome.extension.sendRequest({'action' : 'download', "url" : url});
53 |         return false;
54 |     });
55 |     var Translate = function(wordToTranslate){
56 |         this.rubyC = [];
57 |         this.init(this, wordToTranslate);
58 |     };
59 |     Translate.prototype = {
60 |         init: function(self, wordToTranslate){
61 |             self.wordToTranslate = self.parse(wordToTranslate);
62 |             self.locale = $("form").attr("action").replace("/tolk/locales/", "");
63 |         },
64 |         _req: function(url, callback){
65 |             chrome.extension.sendRequest({'action' : '_req', "url" : url}, callback);
66 |         },
67 |         parse: function(toParse){
68 |             var self = this;
69 |             var ret = "";
70 |             if (toParse.indexOf("---") == -1) {
71 |                 if (toParse.indexOf("{") !== -1) {
72 |                     ret = toParse.replace(/(%{.*?})/ig, function(text) {
73 |                         self.rubyC.push(text);
74 |                         return ""+(self.rubyC.length-1)+"";
75 |                     })
76 | 
77 |                 }else{
78 |                     ret = toParse.replace(/\s<(.+)>(\d+)<\/.+>\s<\/esc>/ig, "<$1>$2").replace(/(\d+)<\/esc>|\s(\d+)\s<\/esc>/ig, function(text, index, indexa) {
79 |                         return self.rubyC[(index?index:indexa)];
80 |                     })
81 |                 }
82 | 
83 |             }else{
84 | 
85 |                 alert("These kinds of phrases aren't supported!")
86 |             }
87 |             return ret;
88 | 
89 |         },
90 |         search: function(callback){
91 |             var self = this;
92 | 
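// Editorial note on the call below: parse() has already swapped any
// Ruby-style %{...} interpolation markers in the phrase for numbered
// placeholders (the <esc> wrappers referenced by the regexes above), so the
// Google Translate v2 request (format=html, source=en, target=<tolk locale>)
// will not translate them; the translated text is then passed through
// parse() again to put the original markers back.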
this._req("https://www.googleapis.com/language/translate/v2?key=AIzaSyCqbDDq_gkCnhpiSfKnOedtJmaBZMZPdp8&format=html&q="+encodeURIComponent(this.wordToTranslate)+"&source=en&target="+this.locale, function(json) { 93 | var word = self.parse( JSON.parse(json).data.translations[0].translatedText ); 94 | callback(word); 95 | }); 96 | } 97 | } 98 | 99 | })(window); -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | youtubedl-server 8 | 9 | 10 | 11 |
youtubedl-server


API

  • http://localhost:9099/?music320aac=url Download and convert to aac
  • http://localhost:9099/?music320mp3=url Download and convert to mp3
  • http://localhost:9099/?musicbest=url Download and extract audio in the best available format
  • http://localhost:9099/?video=url Download video
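Each query parameter above maps to a handler thread in youtube-dl-server that rebuilds a youtube-dl command line. A sketch of the mapping for music320mp3, mirroring the Music320mp3 handler in the script ('youtube-dl-server' stands in for sys.argv[0], and the video URL is a placeholder):

    from urlparse import urlparse  # Python 2, as used by the server

    path = '/?music320mp3=http://www.youtube.com/watch?v=EXAMPLE'
    url = urlparse(path).query.replace('music320mp3=', '')
    # The server re-invokes itself as youtube-dl with fixed audio options:
    print '%s -t "%s" --extract-audio --audio-format mp3 --audio-quality 320k' \
        % ('youtube-dl-server', url)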
37 |

License


(The MIT License) Copyright (c) 2011 Janez Troha 45 | Permission is hereby granted, free of charge, to any person obtaining 46 | a copy of this software and associated documentation files (the 'Software'), 47 | to deal in the Software without restriction, including without limitation 48 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 49 | and/or sell copies of the Software, and to permit persons to whom the Software 50 | is furnished to do so, subject to the following conditions: The above copyright 51 | notice and this permission notice shall be included in all copies or substantial 52 | portions of the Software. THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY 53 | OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 54 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 55 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 56 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 57 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 58 | DEALINGS IN THE SOFTWARE.

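This page is served by the youtube-dl-server script that follows. One detail worth calling out: instead of using HTTPServer.serve_forever(), the script loops on a shouldRun flag so that a request to /die can stop the server cleanly. A condensed, illustrative sketch of that pattern (Python 2, stripped of the download handlers; not the full script):

    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer

    class StoppableHttpServer(HTTPServer):
        def serve_forever(self):
            self.shouldRun = True
            while self.shouldRun:        # re-checked after every handled request
                self.handle_request()    # blocks until a single request arrives

    class Handler(BaseHTTPRequestHandler):
        def do_GET(self):
            if self.path == '/die':      # flipping the flag ends the serve loop
                self.server.shouldRun = False
            self.send_response(200)
            self.end_headers()

    StoppableHttpServer(('localhost', 9099), Handler).serve_forever()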
--------------------------------------------------------------------------------
/youtube-dl-server:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | # Author: Janez Troha
5 | # License: MIT
6 | # NOTE: This script heavily relies on Ubuntu
7 | # TODO: * Queue for downloading, converting
8 | #       * Support for other OSes
9 | 
10 | import sys
11 | import os
12 | import threading
13 | import webbrowser
14 | import re
15 | import youtube_dl
16 | import subprocess
17 | import shlex
18 | 
19 | from urlparse import urlparse
20 | from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
21 | 
22 | try:
23 |     subprocess.call(['ffmpeg'], stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
24 | except (OSError, IOError):
25 |     sys.exit(u'ERROR: "ffmpeg" could not be found, please install it!')
26 | 
27 | try:
28 |     import pynotify
29 |     if not pynotify.init('Youtube-DL'):
30 |         print 'there was a problem initializing the pynotify module'
31 | except:
32 |     print "you don't seem to have pynotify installed"
33 | 
34 | ## Download location detection
35 | 
36 | userhome = os.path.expanduser('~')
37 | try:
38 | 
39 |     # This is an Ubuntu-flavored thing
40 | 
41 |     userdirs = open(os.path.join(userhome, '.config/user-dirs.dirs'), 'r')
42 |     paths = userdirs.read()
43 |     regexVideo = re.compile('XDG_VIDEOS_DIR="(.+)"', re.MULTILINE | re.UNICODE)
44 |     regexMusic = re.compile('XDG_MUSIC_DIR="(.+)"', re.MULTILINE | re.UNICODE)
45 |     VideoPath = regexVideo.findall(paths)[0].replace('$HOME', userhome)
46 |     MusicPath = regexMusic.findall(paths)[0].replace('$HOME', userhome)
47 | except IOError:
48 |     VideoPath = os.path.join(userhome, 'Video')
49 |     MusicPath = os.path.join(userhome, 'Music')
50 | 
51 | 
52 | def Notify(text):
53 |     try:
54 |         n = pynotify.Notification('Youtube-dl', text, 'dialog-info')
55 |         n.set_timeout(1)
56 |         n.show()
57 |     except:
58 |         print "you don't seem to have pynotify installed"
59 | 
60 | 
61 | class ServerThread(threading.Thread):
62 | 
63 |     def __init__(self, port):
64 |         self.port = port
65 |         threading.Thread.__init__(self)
66 | 
67 |     def run(self):
68 |         self.server = StoppableHttpServer(('localhost', self.port),
69 |                 StoppableHttpServerRequestHandler)
70 |         Notify("Server started on 'http://localhost:" + str(self.port) + "'")
71 |         webbrowser.open('http://localhost:' + str(self.port))
72 |         self.server.serve_forever()
73 | 
74 |     def serverIsAlive(self):
75 |         return self.server.shouldRun
76 | 
77 | 
78 | class StoppableHttpServer(HTTPServer):
79 | 
80 |     def serve_forever(self):
81 |         self.shouldRun = True
82 |         while self.shouldRun:
83 |             self.handle_request()
84 | 
85 | 
86 | class StoppableHttpServerRequestHandler(BaseHTTPRequestHandler):
87 | 
88 |     # don't log stuff
89 | 
90 |     def log_message(self, format, *args):
91 |         pass
92 | 
93 |     def do_GET(self):
94 |         try:
95 |             if self.path.endswith('.html') or self.path.endswith('.crx') or self.path == '/':
96 | 
97 |                 # rewrite url
98 | 
99 |                 if self.path == '/':
100 |                     self.path = '/index.html'
101 | 
102 |                 f = open(sys.path[0] + self.path)
103 | 
104 |                 if self.path.endswith('.crx'):
105 |                     self._sendHeader(200, 'application/x-chrome-extension')
106 |                 else:
107 |                     self._sendHeader(200, 'text/html')
108 | 
109 |                 self.wfile.write(f.read())
110 |                 f.close()
111 |                 return
112 |             elif self.path == '/die':
113 |                 Notify('Server kill request received')
114 |                 self._sendHeader(200, 'text/plain')
115 |                 self.wfile.write('Server kill request received')
116 |                 self.server.shouldRun = False
117 |                 return
118 |             elif self._tryResponse('music320mp3', Music320mp3):
119 |                 pass
120 |             elif self._tryResponse('music320aac', Music320aac):
121 |                 pass
122 |             elif self._tryResponse('musicbest', MusicBest):
123 |                 pass
124 |             elif self._tryResponse('video', Video):
125 |                 pass
126 |             else:
127 |                 raise IOError
128 |             return
129 |         except IOError:
130 |             self.send_error(404, 'File Not Found: %s' % self.path)
131 | 
132 |     def _sendHeader(self, response_code, content_type):
133 |         self.send_response(response_code)
134 |         self.send_header('Content-type', content_type)
135 |         self.end_headers()
136 | 
137 |     def _tryResponse(self, handler, postprocessor):
138 |         if '?' + handler + '=' in self.path:
139 |             self._sendHeader(200, 'text/plain')
140 |             self.wfile.write('Downloading of "' + self.path + '" started')
141 |             run = postprocessor(self.path)
142 |             run.start()
143 |             return True
144 |         else:
145 |             return False
146 | 
147 | 
148 | ## Music handlers
149 | 
150 | class Music320mp3(threading.Thread):
151 | 
152 |     def __init__(self, url):
153 |         self.url = urlparse(url).query.replace('music320mp3=', '')
154 |         self.command = '%s -t "%s" --extract-audio --audio-format mp3 --audio-quality 320k' \
155 |             % (sys.argv[0], self.url)
156 |         Notify('Downloading of "' + self.url + '" started')
157 |         threading.Thread.__init__(self)
158 | 
159 |     def run(self):
160 |         subprocess.call(shlex.split(self.command))
161 | 
162 | 
163 | class Music320aac(threading.Thread):
164 | 
165 |     def __init__(self, url):
166 |         self.url = urlparse(url).query.replace('music320aac=', '')
167 |         self.command = '%s -t "%s" --extract-audio --audio-format aac --audio-quality 320k' \
168 |             % (sys.argv[0], self.url)
169 |         Notify('Downloading of "' + self.url + '" started')
170 |         threading.Thread.__init__(self)
171 | 
172 |     def run(self):
173 |         subprocess.call(shlex.split(self.command))
174 | 
175 | 
176 | class MusicBest(threading.Thread):
177 | 
178 |     def __init__(self, url):
179 |         self.url = urlparse(url).query.replace('musicbest=', '')
180 |         self.command = '%s -t "%s" --extract-audio' % (sys.argv[0], self.url)
181 |         Notify('Downloading of "' + self.url + '" started')
182 |         threading.Thread.__init__(self)
183 | 
184 |     def run(self):
185 |         subprocess.call(shlex.split(self.command))
186 | 
187 | class Video(threading.Thread):
188 | 
189 |     def __init__(self, url):
190 |         self.url = urlparse(url).query.replace('video=', '')
191 |         self.command = '%s -t "%s"' % (sys.argv[0], self.url)
192 |         Notify('Downloading of "' + self.url + '" started')
193 |         threading.Thread.__init__(self)
194 | 
195 |     def run(self):
196 |         subprocess.call(shlex.split(self.command))
197 | 
198 | ## Main routine ##
199 | 
200 | def main():
201 |     try:
202 |         server = ServerThread(9099)
203 |         server.start()
204 |     except:
205 |         sys.exit(0)
206 | 
207 | 
208 | if __name__ == '__main__':
209 |     try:
210 |         if len(sys.argv) > 1:
211 |             youtube_dl.main()
212 |         else:
213 |             main()
214 |     except:
215 |         print 'Unknown error'
216 | 

--------------------------------------------------------------------------------
/youtube_dl/PostProcessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os
5 | import subprocess
6 | import sys
7 | import time
8 | 
9 | from utils import *
10 | 
11 | 
12 | class PostProcessor(object):
13 |     """Post Processor class.
14 | 
15 |     PostProcessor objects can be added to downloaders with their
16 |     add_post_processor() method.
When the downloader has finished a 17 | successful download, it will take its internal chain of PostProcessors 18 | and start calling the run() method on each one of them, first with 19 | an initial argument and then with the returned value of the previous 20 | PostProcessor. 21 | 22 | The chain will be stopped if one of them ever returns None or the end 23 | of the chain is reached. 24 | 25 | PostProcessor objects follow a "mutual registration" process similar 26 | to InfoExtractor objects. 27 | """ 28 | 29 | _downloader = None 30 | 31 | def __init__(self, downloader=None): 32 | self._downloader = downloader 33 | 34 | def set_downloader(self, downloader): 35 | """Sets the downloader for this PP.""" 36 | self._downloader = downloader 37 | 38 | def run(self, information): 39 | """Run the PostProcessor. 40 | 41 | The "information" argument is a dictionary like the ones 42 | composed by InfoExtractors. The only difference is that this 43 | one has an extra field called "filepath" that points to the 44 | downloaded file. 45 | 46 | When this method returns None, the postprocessing chain is 47 | stopped. However, this method may return an information 48 | dictionary that will be passed to the next postprocessing 49 | object in the chain. It can be the one it received after 50 | changing some fields. 51 | 52 | In addition, this method may raise a PostProcessingError 53 | exception that will be taken into account by the downloader 54 | it was called from. 55 | """ 56 | return information # by default, do nothing 57 | 58 | class AudioConversionError(BaseException): 59 | def __init__(self, message): 60 | self.message = message 61 | 62 | class FFmpegExtractAudioPP(PostProcessor): 63 | def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False): 64 | PostProcessor.__init__(self, downloader) 65 | if preferredcodec is None: 66 | preferredcodec = 'best' 67 | self._preferredcodec = preferredcodec 68 | self._preferredquality = preferredquality 69 | self._keepvideo = keepvideo 70 | self._exes = self.detect_executables() 71 | 72 | @staticmethod 73 | def detect_executables(): 74 | available = {'avprobe' : False, 'avconv' : False, 'ffmpeg' : False, 'ffprobe' : False} 75 | for path in os.environ["PATH"].split(os.pathsep): 76 | for program in available.keys(): 77 | exe_file = os.path.join(path, program) 78 | if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK): 79 | available[program] = exe_file 80 | return available 81 | 82 | def get_audio_codec(self, path): 83 | if not self._exes['ffprobe'] and not self._exes['avprobe']: return None 84 | try: 85 | cmd = [self._exes['avprobe'] or self._exes['ffprobe'], '-show_streams', '--', encodeFilename(path)] 86 | handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE) 87 | output = handle.communicate()[0] 88 | if handle.wait() != 0: 89 | return None 90 | except (IOError, OSError): 91 | return None 92 | audio_codec = None 93 | for line in output.split('\n'): 94 | if line.startswith('codec_name='): 95 | audio_codec = line.split('=')[1].strip() 96 | elif line.strip() == 'codec_type=audio' and audio_codec is not None: 97 | return audio_codec 98 | return None 99 | 100 | def run_ffmpeg(self, path, out_path, codec, more_opts): 101 | if not self._exes['ffmpeg'] and not self._exes['avconv']: 102 | raise AudioConversionError('ffmpeg or avconv not found. 
Please install one.') 103 | if codec is None: 104 | acodec_opts = [] 105 | else: 106 | acodec_opts = ['-acodec', codec] 107 | cmd = ([self._exes['avconv'] or self._exes['ffmpeg'], '-y', '-i', encodeFilename(path), '-vn'] 108 | + acodec_opts + more_opts + 109 | ['--', encodeFilename(out_path)]) 110 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 111 | stdout,stderr = p.communicate() 112 | if p.returncode != 0: 113 | msg = stderr.strip().split('\n')[-1] 114 | raise AudioConversionError(msg) 115 | 116 | def run(self, information): 117 | path = information['filepath'] 118 | 119 | filecodec = self.get_audio_codec(path) 120 | if filecodec is None: 121 | self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe') 122 | return None 123 | 124 | more_opts = [] 125 | if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'): 126 | if self._preferredcodec == 'm4a' and filecodec == 'aac': 127 | # Lossless, but in another container 128 | acodec = 'copy' 129 | extension = self._preferredcodec 130 | more_opts = [self._exes['avconv'] and '-bsf:a' or '-absf', 'aac_adtstoasc'] 131 | elif filecodec in ['aac', 'mp3', 'vorbis']: 132 | # Lossless if possible 133 | acodec = 'copy' 134 | extension = filecodec 135 | if filecodec == 'aac': 136 | more_opts = ['-f', 'adts'] 137 | if filecodec == 'vorbis': 138 | extension = 'ogg' 139 | else: 140 | # MP3 otherwise. 141 | acodec = 'libmp3lame' 142 | extension = 'mp3' 143 | more_opts = [] 144 | if self._preferredquality is not None: 145 | more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality] 146 | else: 147 | # We convert the audio (lossy) 148 | acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec] 149 | extension = self._preferredcodec 150 | more_opts = [] 151 | if self._preferredquality is not None: 152 | more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality] 153 | if self._preferredcodec == 'aac': 154 | more_opts += ['-f', 'adts'] 155 | if self._preferredcodec == 'm4a': 156 | more_opts += [self._exes['avconv'] and '-bsf:a' or '-absf', 'aac_adtstoasc'] 157 | if self._preferredcodec == 'vorbis': 158 | extension = 'ogg' 159 | if self._preferredcodec == 'wav': 160 | extension = 'wav' 161 | more_opts += ['-f', 'wav'] 162 | 163 | prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups 164 | new_path = prefix + sep + extension 165 | self._downloader.to_screen(u'[' + (self._exes['avconv'] and 'avconv' or 'ffmpeg') + '] Destination: ' + new_path) 166 | try: 167 | self.run_ffmpeg(path, new_path, acodec, more_opts) 168 | except: 169 | etype,e,tb = sys.exc_info() 170 | if isinstance(e, AudioConversionError): 171 | self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message) 172 | else: 173 | self._downloader.to_stderr(u'ERROR: error running ' + (self._exes['avconv'] and 'avconv' or 'ffmpeg')) 174 | return None 175 | 176 | # Try to update the date time for extracted audio file. 
177 | if information.get('filetime') is not None: 178 | try: 179 | os.utime(encodeFilename(new_path), (time.time(), information['filetime'])) 180 | except: 181 | self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file') 182 | 183 | if not self._keepvideo: 184 | try: 185 | os.remove(encodeFilename(path)) 186 | except (IOError, OSError): 187 | self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file') 188 | return None 189 | 190 | information['filepath'] = new_path 191 | return information 192 | -------------------------------------------------------------------------------- /youtube_dl/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import gzip 5 | import htmlentitydefs 6 | import HTMLParser 7 | import locale 8 | import os 9 | import re 10 | import sys 11 | import zlib 12 | import urllib2 13 | import email.utils 14 | import json 15 | 16 | try: 17 | import cStringIO as StringIO 18 | except ImportError: 19 | import StringIO 20 | 21 | std_headers = { 22 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', 23 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 24 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 25 | 'Accept-Encoding': 'gzip, deflate', 26 | 'Accept-Language': 'en-us,en;q=0.5', 27 | } 28 | 29 | def preferredencoding(): 30 | """Get preferred encoding. 31 | 32 | Returns the best encoding scheme for the system, based on 33 | locale.getpreferredencoding() and some further tweaks. 34 | """ 35 | def yield_preferredencoding(): 36 | try: 37 | pref = locale.getpreferredencoding() 38 | u'TEST'.encode(pref) 39 | except: 40 | pref = 'UTF-8' 41 | while True: 42 | yield pref 43 | return yield_preferredencoding().next() 44 | 45 | 46 | def htmlentity_transform(matchobj): 47 | """Transforms an HTML entity to a Unicode character. 48 | 49 | This function receives a match object and is intended to be used with 50 | the re.sub() function. 
51 | """ 52 | entity = matchobj.group(1) 53 | 54 | # Known non-numeric HTML entity 55 | if entity in htmlentitydefs.name2codepoint: 56 | return unichr(htmlentitydefs.name2codepoint[entity]) 57 | 58 | # Unicode character 59 | mobj = re.match(ur'(?u)#(x?\d+)', entity) 60 | if mobj is not None: 61 | numstr = mobj.group(1) 62 | if numstr.startswith(u'x'): 63 | base = 16 64 | numstr = u'0%s' % numstr 65 | else: 66 | base = 10 67 | return unichr(long(numstr, base)) 68 | 69 | # Unknown entity in name, return its literal representation 70 | return (u'&%s;' % entity) 71 | 72 | HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix 73 | class IDParser(HTMLParser.HTMLParser): 74 | """Modified HTMLParser that isolates a tag with the specified id""" 75 | def __init__(self, id): 76 | self.id = id 77 | self.result = None 78 | self.started = False 79 | self.depth = {} 80 | self.html = None 81 | self.watch_startpos = False 82 | self.error_count = 0 83 | HTMLParser.HTMLParser.__init__(self) 84 | 85 | def error(self, message): 86 | print >> sys.stderr, self.getpos() 87 | if self.error_count > 10 or self.started: 88 | raise HTMLParser.HTMLParseError(message, self.getpos()) 89 | self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line 90 | self.error_count += 1 91 | self.goahead(1) 92 | 93 | def loads(self, html): 94 | self.html = html 95 | self.feed(html) 96 | self.close() 97 | 98 | def handle_starttag(self, tag, attrs): 99 | attrs = dict(attrs) 100 | if self.started: 101 | self.find_startpos(None) 102 | if 'id' in attrs and attrs['id'] == self.id: 103 | self.result = [tag] 104 | self.started = True 105 | self.watch_startpos = True 106 | if self.started: 107 | if not tag in self.depth: self.depth[tag] = 0 108 | self.depth[tag] += 1 109 | 110 | def handle_endtag(self, tag): 111 | if self.started: 112 | if tag in self.depth: self.depth[tag] -= 1 113 | if self.depth[self.result[0]] == 0: 114 | self.started = False 115 | self.result.append(self.getpos()) 116 | 117 | def find_startpos(self, x): 118 | """Needed to put the start position of the result (self.result[1]) 119 | after the opening tag with the requested id""" 120 | if self.watch_startpos: 121 | self.watch_startpos = False 122 | self.result.append(self.getpos()) 123 | handle_entityref = handle_charref = handle_data = handle_comment = \ 124 | handle_decl = handle_pi = unknown_decl = find_startpos 125 | 126 | def get_result(self): 127 | if self.result == None: return None 128 | if len(self.result) != 3: return None 129 | lines = self.html.split('\n') 130 | lines = lines[self.result[1][0]-1:self.result[2][0]] 131 | lines[0] = lines[0][self.result[1][1]:] 132 | if len(lines) == 1: 133 | lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] 134 | lines[-1] = lines[-1][:self.result[2][1]] 135 | return '\n'.join(lines).strip() 136 | 137 | def get_element_by_id(id, html): 138 | """Return the content of the tag with the specified id in the passed HTML document""" 139 | parser = IDParser(id) 140 | try: 141 | parser.loads(html) 142 | except HTMLParser.HTMLParseError: 143 | pass 144 | return parser.get_result() 145 | 146 | 147 | def clean_html(html): 148 | """Clean an HTML snippet into a readable string""" 149 | # Newline vs
150 | html = html.replace('\n', ' ') 151 | html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) 152 | # Strip html tags 153 | html = re.sub('<.*?>', '', html) 154 | # Replace html entities 155 | html = unescapeHTML(html) 156 | return html 157 | 158 | 159 | def sanitize_open(filename, open_mode): 160 | """Try to open the given filename, and slightly tweak it if this fails. 161 | 162 | Attempts to open the given filename. If this fails, it tries to change 163 | the filename slightly, step by step, until it's either able to open it 164 | or it fails and raises a final exception, like the standard open() 165 | function. 166 | 167 | It returns the tuple (stream, definitive_file_name). 168 | """ 169 | try: 170 | if filename == u'-': 171 | if sys.platform == 'win32': 172 | import msvcrt 173 | msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) 174 | return (sys.stdout, filename) 175 | stream = open(encodeFilename(filename), open_mode) 176 | return (stream, filename) 177 | except (IOError, OSError), err: 178 | # In case of error, try to remove win32 forbidden chars 179 | filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) 180 | 181 | # An exception here should be caught in the caller 182 | stream = open(encodeFilename(filename), open_mode) 183 | return (stream, filename) 184 | 185 | 186 | def timeconvert(timestr): 187 | """Convert RFC 2822 defined time string into system timestamp""" 188 | timestamp = None 189 | timetuple = email.utils.parsedate_tz(timestr) 190 | if timetuple is not None: 191 | timestamp = email.utils.mktime_tz(timetuple) 192 | return timestamp 193 | 194 | def sanitize_filename(s): 195 | """Sanitizes a string so it could be used as part of a filename.""" 196 | def replace_insane(char): 197 | if char in u' .\\/|?*<>:"' or ord(char) < 32: 198 | return '_' 199 | return char 200 | return u''.join(map(replace_insane, s)).strip('_') 201 | 202 | def orderedSet(iterable): 203 | """ Remove all duplicates from the input iterable """ 204 | res = [] 205 | for el in iterable: 206 | if el not in res: 207 | res.append(el) 208 | return res 209 | 210 | def unescapeHTML(s): 211 | """ 212 | @param s a string (of type unicode) 213 | """ 214 | assert type(s) == type(u'') 215 | 216 | result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s) 217 | return result 218 | 219 | def encodeFilename(s): 220 | """ 221 | @param s The name of the file (of type unicode) 222 | """ 223 | 224 | assert type(s) == type(u'') 225 | 226 | if sys.platform == 'win32' and sys.getwindowsversion().major >= 5: 227 | # Pass u'' directly to use Unicode APIs on Windows 2000 and up 228 | # (Detecting Windows NT 4 is tricky because 'major >= 4' would 229 | # match Windows 9x series as well. Besides, NT 4 is obsolete.) 230 | return s 231 | else: 232 | return s.encode(sys.getfilesystemencoding(), 'ignore') 233 | 234 | class DownloadError(Exception): 235 | """Download Error exception. 236 | 237 | This exception may be thrown by FileDownloader objects if they are not 238 | configured to continue on errors. They will contain the appropriate 239 | error message. 240 | """ 241 | pass 242 | 243 | 244 | class SameFileError(Exception): 245 | """Same File exception. 246 | 247 | This exception will be thrown by FileDownloader objects if they detect 248 | multiple files would have to be downloaded to the same file on disk. 249 | """ 250 | pass 251 | 252 | 253 | class PostProcessingError(Exception): 254 | """Post Processing exception. 
255 | 256 | This exception may be raised by PostProcessor's .run() method to 257 | indicate an error in the postprocessing task. 258 | """ 259 | pass 260 | 261 | class MaxDownloadsReached(Exception): 262 | """ --max-downloads limit has been reached. """ 263 | pass 264 | 265 | 266 | class UnavailableVideoError(Exception): 267 | """Unavailable Format exception. 268 | 269 | This exception will be thrown when a video is requested 270 | in a format that is not available for that video. 271 | """ 272 | pass 273 | 274 | 275 | class ContentTooShortError(Exception): 276 | """Content Too Short exception. 277 | 278 | This exception may be raised by FileDownloader objects when a file they 279 | download is too small for what the server announced first, indicating 280 | the connection was probably interrupted. 281 | """ 282 | # Both in bytes 283 | downloaded = None 284 | expected = None 285 | 286 | def __init__(self, downloaded, expected): 287 | self.downloaded = downloaded 288 | self.expected = expected 289 | 290 | 291 | class Trouble(Exception): 292 | """Trouble helper exception 293 | 294 | This is an exception to be handled with 295 | FileDownloader.trouble 296 | """ 297 | 298 | class YoutubeDLHandler(urllib2.HTTPHandler): 299 | """Handler for HTTP requests and responses. 300 | 301 | This class, when installed with an OpenerDirector, automatically adds 302 | the standard headers to every HTTP request and handles gzipped and 303 | deflated responses from web servers. If compression is to be avoided in 304 | a particular request, the original request in the program code only has 305 | to include the HTTP header "Youtubedl-No-Compression", which will be 306 | removed before making the real request. 307 | 308 | Part of this code was copied from: 309 | 310 | http://techknack.net/python-urllib2-handlers/ 311 | 312 | Andrew Rowls, the author of that code, agreed to release it to the 313 | public domain. 
314 | """ 315 | 316 | @staticmethod 317 | def deflate(data): 318 | try: 319 | return zlib.decompress(data, -zlib.MAX_WBITS) 320 | except zlib.error: 321 | return zlib.decompress(data) 322 | 323 | @staticmethod 324 | def addinfourl_wrapper(stream, headers, url, code): 325 | if hasattr(urllib2.addinfourl, 'getcode'): 326 | return urllib2.addinfourl(stream, headers, url, code) 327 | ret = urllib2.addinfourl(stream, headers, url) 328 | ret.code = code 329 | return ret 330 | 331 | def http_request(self, req): 332 | for h in std_headers: 333 | if h in req.headers: 334 | del req.headers[h] 335 | req.add_header(h, std_headers[h]) 336 | if 'Youtubedl-no-compression' in req.headers: 337 | if 'Accept-encoding' in req.headers: 338 | del req.headers['Accept-encoding'] 339 | del req.headers['Youtubedl-no-compression'] 340 | return req 341 | 342 | def http_response(self, req, resp): 343 | old_resp = resp 344 | # gzip 345 | if resp.headers.get('Content-encoding', '') == 'gzip': 346 | gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r') 347 | resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) 348 | resp.msg = old_resp.msg 349 | # deflate 350 | if resp.headers.get('Content-encoding', '') == 'deflate': 351 | gz = StringIO.StringIO(self.deflate(resp.read())) 352 | resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) 353 | resp.msg = old_resp.msg 354 | return resp 355 | -------------------------------------------------------------------------------- /youtube_dl/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | __authors__ = ( 5 | 'Ricardo Garcia Gonzalez', 6 | 'Danny Colligan', 7 | 'Benjamin Johnson', 8 | 'Vasyl\' Vavrychuk', 9 | 'Witold Baryluk', 10 | 'Paweł Paprota', 11 | 'Gergely Imreh', 12 | 'Rogério Brito', 13 | 'Philipp Hagemeister', 14 | 'Sören Schulze', 15 | 'Kevin Ngo', 16 | 'Ori Avtalion', 17 | 'shizeeg', 18 | 'Filippo Valsorda', 19 | ) 20 | 21 | __license__ = 'Public Domain' 22 | __version__ = '2012.02.27' 23 | 24 | UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' 25 | UPDATE_URL_VERSION = 'https://raw.github.com/rg3/youtube-dl/master/LATEST_VERSION' 26 | UPDATE_URL_EXE = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl.exe' 27 | 28 | 29 | import cookielib 30 | import getpass 31 | import optparse 32 | import os 33 | import re 34 | import shlex 35 | import socket 36 | import subprocess 37 | import sys 38 | import urllib2 39 | import warnings 40 | 41 | from utils import * 42 | from FileDownloader import * 43 | from InfoExtractors import * 44 | from PostProcessor import * 45 | 46 | def updateSelf(downloader, filename): 47 | ''' Update the program file with the latest version from the repository ''' 48 | # Note: downloader only used for options 49 | 50 | if not os.access(filename, os.W_OK): 51 | sys.exit('ERROR: no write permissions on %s' % filename) 52 | 53 | downloader.to_screen(u'Updating to latest version...') 54 | 55 | urlv = urllib2.urlopen(UPDATE_URL_VERSION) 56 | newversion = urlv.read().strip() 57 | if newversion == __version__: 58 | downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')') 59 | return 60 | urlv.close() 61 | 62 | if hasattr(sys, "frozen"): #py2exe 63 | exe = os.path.abspath(filename) 64 | directory = os.path.dirname(exe) 65 | if not os.access(directory, os.W_OK): 66 | sys.exit('ERROR: no write permissions on %s' % directory) 67 | 68 | try: 69 | urlh = 
urllib2.urlopen(UPDATE_URL_EXE) 70 | newcontent = urlh.read() 71 | urlh.close() 72 | with open(exe + '.new', 'wb') as outf: 73 | outf.write(newcontent) 74 | except (IOError, OSError), err: 75 | sys.exit('ERROR: unable to download latest version') 76 | 77 | try: 78 | bat = os.path.join(directory, 'youtube-dl-updater.bat') 79 | b = open(bat, 'w') 80 | 81 | print >> b, """ 82 | echo Updating youtube-dl... 83 | ping 127.0.0.1 -n 5 -w 1000 > NUL 84 | move /Y "%s.new" "%s" 85 | del "%s" 86 | """ %(exe, exe, bat) 87 | 88 | b.close() 89 | 90 | os.startfile(bat) 91 | except (IOError, OSError), err: 92 | sys.exit('ERROR: unable to overwrite current version') 93 | 94 | else: 95 | try: 96 | urlh = urllib2.urlopen(UPDATE_URL) 97 | newcontent = urlh.read() 98 | urlh.close() 99 | except (IOError, OSError), err: 100 | sys.exit('ERROR: unable to download latest version') 101 | 102 | try: 103 | with open(filename, 'wb') as outf: 104 | outf.write(newcontent) 105 | except (IOError, OSError), err: 106 | sys.exit('ERROR: unable to overwrite current version') 107 | 108 | downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.') 109 | 110 | def parseOpts(): 111 | def _readOptions(filename_bytes): 112 | try: 113 | optionf = open(filename_bytes) 114 | except IOError: 115 | return [] # silently skip if file is not present 116 | try: 117 | res = [] 118 | for l in optionf: 119 | res += shlex.split(l, comments=True) 120 | finally: 121 | optionf.close() 122 | return res 123 | 124 | def _format_option_string(option): 125 | ''' ('-o', '--option') -> -o, --format METAVAR''' 126 | 127 | opts = [] 128 | 129 | if option._short_opts: opts.append(option._short_opts[0]) 130 | if option._long_opts: opts.append(option._long_opts[0]) 131 | if len(opts) > 1: opts.insert(1, ', ') 132 | 133 | if option.takes_value(): opts.append(' %s' % option.metavar) 134 | 135 | return "".join(opts) 136 | 137 | def _find_term_columns(): 138 | columns = os.environ.get('COLUMNS', None) 139 | if columns: 140 | return int(columns) 141 | 142 | try: 143 | sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 144 | out,err = sp.communicate() 145 | return int(out.split()[1]) 146 | except: 147 | pass 148 | return None 149 | 150 | max_width = 80 151 | max_help_position = 80 152 | 153 | # No need to wrap help messages if we're on a wide console 154 | columns = _find_term_columns() 155 | if columns: max_width = columns 156 | 157 | fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position) 158 | fmt.format_option_strings = _format_option_string 159 | 160 | kw = { 161 | 'version' : __version__, 162 | 'formatter' : fmt, 163 | 'usage' : '%prog [options] url [url...]', 164 | 'conflict_handler' : 'resolve', 165 | } 166 | 167 | parser = optparse.OptionParser(**kw) 168 | 169 | # option groups 170 | general = optparse.OptionGroup(parser, 'General Options') 171 | selection = optparse.OptionGroup(parser, 'Video Selection') 172 | authentication = optparse.OptionGroup(parser, 'Authentication Options') 173 | video_format = optparse.OptionGroup(parser, 'Video Format Options') 174 | postproc = optparse.OptionGroup(parser, 'Post-processing Options') 175 | filesystem = optparse.OptionGroup(parser, 'Filesystem Options') 176 | verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') 177 | 178 | general.add_option('-h', '--help', 179 | action='help', help='print this help text and exit') 180 | general.add_option('-v', '--version', 181 | action='version', help='print 
program version and exit') 182 | general.add_option('-U', '--update', 183 | action='store_true', dest='update_self', help='update this program to latest version') 184 | general.add_option('-i', '--ignore-errors', 185 | action='store_true', dest='ignoreerrors', help='continue on download errors', default=False) 186 | general.add_option('-r', '--rate-limit', 187 | dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)') 188 | general.add_option('-R', '--retries', 189 | dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10) 190 | general.add_option('--dump-user-agent', 191 | action='store_true', dest='dump_user_agent', 192 | help='display the current browser identification', default=False) 193 | general.add_option('--list-extractors', 194 | action='store_true', dest='list_extractors', 195 | help='List all supported extractors and the URLs they would handle', default=False) 196 | 197 | selection.add_option('--playlist-start', 198 | dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1) 199 | selection.add_option('--playlist-end', 200 | dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1) 201 | selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)') 202 | selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)') 203 | selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None) 204 | 205 | authentication.add_option('-u', '--username', 206 | dest='username', metavar='USERNAME', help='account username') 207 | authentication.add_option('-p', '--password', 208 | dest='password', metavar='PASSWORD', help='account password') 209 | authentication.add_option('-n', '--netrc', 210 | action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False) 211 | 212 | 213 | video_format.add_option('-f', '--format', 214 | action='store', dest='format', metavar='FORMAT', help='video format code') 215 | video_format.add_option('--all-formats', 216 | action='store_const', dest='format', help='download all available video formats', const='all') 217 | video_format.add_option('--prefer-free-formats', 218 | action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested') 219 | video_format.add_option('--max-quality', 220 | action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') 221 | video_format.add_option('-F', '--list-formats', 222 | action='store_true', dest='listformats', help='list all available formats (currently youtube only)') 223 | video_format.add_option('--write-srt', 224 | action='store_true', dest='writesubtitles', 225 | help='write video closed captions to a .srt file (currently youtube only)', default=False) 226 | video_format.add_option('--srt-lang', 227 | action='store', dest='subtitleslang', metavar='LANG', 228 | help='language of the closed captions to download (optional) use IETF language tags like \'en\'') 229 | 230 | 231 | verbosity.add_option('-q', '--quiet', 232 | action='store_true', dest='quiet', help='activates quiet mode', default=False) 233 | verbosity.add_option('-s', '--simulate', 234 | action='store_true', dest='simulate', help='do not download the video 
and do not write anything to disk', default=False) 235 | verbosity.add_option('--skip-download', 236 | action='store_true', dest='skip_download', help='do not download the video', default=False) 237 | verbosity.add_option('-g', '--get-url', 238 | action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False) 239 | verbosity.add_option('-e', '--get-title', 240 | action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False) 241 | verbosity.add_option('--get-thumbnail', 242 | action='store_true', dest='getthumbnail', 243 | help='simulate, quiet but print thumbnail URL', default=False) 244 | verbosity.add_option('--get-description', 245 | action='store_true', dest='getdescription', 246 | help='simulate, quiet but print video description', default=False) 247 | verbosity.add_option('--get-filename', 248 | action='store_true', dest='getfilename', 249 | help='simulate, quiet but print output filename', default=False) 250 | verbosity.add_option('--get-format', 251 | action='store_true', dest='getformat', 252 | help='simulate, quiet but print output format', default=False) 253 | verbosity.add_option('--no-progress', 254 | action='store_true', dest='noprogress', help='do not print progress bar', default=False) 255 | verbosity.add_option('--console-title', 256 | action='store_true', dest='consoletitle', 257 | help='display progress in console titlebar', default=False) 258 | verbosity.add_option('-v', '--verbose', 259 | action='store_true', dest='verbose', help='print various debugging information', default=False) 260 | 261 | 262 | filesystem.add_option('-t', '--title', 263 | action='store_true', dest='usetitle', help='use title in file name', default=False) 264 | filesystem.add_option('-l', '--literal', 265 | action='store_true', dest='useliteral', help='use literal title in file name', default=False) 266 | filesystem.add_option('-A', '--auto-number', 267 | action='store_true', dest='autonumber', 268 | help='number downloaded files starting from 00000', default=False) 269 | filesystem.add_option('-o', '--output', 270 | dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. 
Use - to output to stdout.') 271 | filesystem.add_option('-a', '--batch-file', 272 | dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)') 273 | filesystem.add_option('-w', '--no-overwrites', 274 | action='store_true', dest='nooverwrites', help='do not overwrite files', default=False) 275 | filesystem.add_option('-c', '--continue', 276 | action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True) 277 | filesystem.add_option('--no-continue', 278 | action='store_false', dest='continue_dl', 279 | help='do not resume partially downloaded files (restart from beginning)') 280 | filesystem.add_option('--cookies', 281 | dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in') 282 | filesystem.add_option('--no-part', 283 | action='store_true', dest='nopart', help='do not use .part files', default=False) 284 | filesystem.add_option('--no-mtime', 285 | action='store_false', dest='updatetime', 286 | help='do not use the Last-modified header to set the file modification time', default=True) 287 | filesystem.add_option('--write-description', 288 | action='store_true', dest='writedescription', 289 | help='write video description to a .description file', default=False) 290 | filesystem.add_option('--write-info-json', 291 | action='store_true', dest='writeinfojson', 292 | help='write video metadata to a .info.json file', default=False) 293 | 294 | 295 | postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False, 296 | help='convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') 297 | postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best', 298 | help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default') 299 | postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K', 300 | help='ffmpeg/avconv audio bitrate specification, 128k by default') 301 | postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False, 302 | help='keeps the video file on disk after the post-processing; the video is erased by default') 303 | 304 | 305 | parser.add_option_group(general) 306 | parser.add_option_group(selection) 307 | parser.add_option_group(filesystem) 308 | parser.add_option_group(verbosity) 309 | parser.add_option_group(video_format) 310 | parser.add_option_group(authentication) 311 | parser.add_option_group(postproc) 312 | 313 | xdg_config_home = os.environ.get('XDG_CONFIG_HOME') 314 | if xdg_config_home: 315 | userConf = os.path.join(xdg_config_home, 'youtube-dl.conf') 316 | else: 317 | userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') 318 | argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:] 319 | opts, args = parser.parse_args(argv) 320 | 321 | return parser, opts, args 322 | 323 | def gen_extractors(): 324 | """ Return a list of an instance of every supported extractor. 325 | The order does matter; the first extractor matched is the one handling the URL. 
326 | """ 327 | return [ 328 | YoutubePlaylistIE(), 329 | YoutubeUserIE(), 330 | YoutubeSearchIE(), 331 | YoutubeIE(), 332 | MetacafeIE(), 333 | DailymotionIE(), 334 | GoogleIE(), 335 | GoogleSearchIE(), 336 | PhotobucketIE(), 337 | YahooIE(), 338 | YahooSearchIE(), 339 | DepositFilesIE(), 340 | FacebookIE(), 341 | BlipTVIE(), 342 | VimeoIE(), 343 | MyVideoIE(), 344 | ComedyCentralIE(), 345 | EscapistIE(), 346 | CollegeHumorIE(), 347 | XVideosIE(), 348 | SoundcloudIE(), 349 | InfoQIE(), 350 | MixcloudIE(), 351 | StanfordOpenClassroomIE(), 352 | MTVIE(), 353 | 354 | GenericIE() 355 | ] 356 | 357 | def _real_main(): 358 | parser, opts, args = parseOpts() 359 | 360 | # Open appropriate CookieJar 361 | if opts.cookiefile is None: 362 | jar = cookielib.CookieJar() 363 | else: 364 | try: 365 | jar = cookielib.MozillaCookieJar(opts.cookiefile) 366 | if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK): 367 | jar.load() 368 | except (IOError, OSError), err: 369 | sys.exit(u'ERROR: unable to open cookie file') 370 | 371 | # Dump user agent 372 | if opts.dump_user_agent: 373 | print std_headers['User-Agent'] 374 | sys.exit(0) 375 | 376 | # Batch file verification 377 | batchurls = [] 378 | if opts.batchfile is not None: 379 | try: 380 | if opts.batchfile == '-': 381 | batchfd = sys.stdin 382 | else: 383 | batchfd = open(opts.batchfile, 'r') 384 | batchurls = batchfd.readlines() 385 | batchurls = [x.strip() for x in batchurls] 386 | batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] 387 | except IOError: 388 | sys.exit(u'ERROR: batch file could not be read') 389 | all_urls = batchurls + args 390 | all_urls = map(lambda url: url.strip(), all_urls) 391 | 392 | # General configuration 393 | cookie_processor = urllib2.HTTPCookieProcessor(jar) 394 | proxy_handler = urllib2.ProxyHandler() 395 | opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) 396 | urllib2.install_opener(opener) 397 | socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) 398 | 399 | extractors = gen_extractors() 400 | 401 | if opts.list_extractors: 402 | for ie in extractors: 403 | print(ie.IE_NAME) 404 | matchedUrls = filter(lambda url: ie.suitable(url), all_urls) 405 | all_urls = filter(lambda url: url not in matchedUrls, all_urls) 406 | for mu in matchedUrls: 407 | print(u' ' + mu) 408 | sys.exit(0) 409 | 410 | # Conflicting, missing and erroneous options 411 | if opts.usenetrc and (opts.username is not None or opts.password is not None): 412 | parser.error(u'using .netrc conflicts with giving username/password') 413 | if opts.password is not None and opts.username is None: 414 | parser.error(u'account username missing') 415 | if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber): 416 | parser.error(u'using output template conflicts with using title, literal title or auto number') 417 | if opts.usetitle and opts.useliteral: 418 | parser.error(u'using title conflicts with using literal title') 419 | if opts.username is not None and opts.password is None: 420 | opts.password = getpass.getpass(u'Type account password and press return:') 421 | if opts.ratelimit is not None: 422 | numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) 423 | if numeric_limit is None: 424 | parser.error(u'invalid rate limit specified') 425 | opts.ratelimit = numeric_limit 426 | if opts.retries is not None: 427 | try: 428 | opts.retries = long(opts.retries) 429 | except (TypeError, ValueError), err: 430 | 
parser.error(u'invalid retry count specified') 431 | try: 432 | opts.playliststart = int(opts.playliststart) 433 | if opts.playliststart <= 0: 434 | raise ValueError(u'Playlist start must be positive') 435 | except (TypeError, ValueError), err: 436 | parser.error(u'invalid playlist start number specified') 437 | try: 438 | opts.playlistend = int(opts.playlistend) 439 | if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart): 440 | raise ValueError(u'Playlist end must be greater than playlist start') 441 | except (TypeError, ValueError), err: 442 | parser.error(u'invalid playlist end number specified') 443 | if opts.extractaudio: 444 | if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']: 445 | parser.error(u'invalid audio format specified') 446 | 447 | # File downloader 448 | fd = FileDownloader({ 449 | 'usenetrc': opts.usenetrc, 450 | 'username': opts.username, 451 | 'password': opts.password, 452 | 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat), 453 | 'forceurl': opts.geturl, 454 | 'forcetitle': opts.gettitle, 455 | 'forcethumbnail': opts.getthumbnail, 456 | 'forcedescription': opts.getdescription, 457 | 'forcefilename': opts.getfilename, 458 | 'forceformat': opts.getformat, 459 | 'simulate': opts.simulate, 460 | 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat), 461 | 'format': opts.format, 462 | 'format_limit': opts.format_limit, 463 | 'listformats': opts.listformats, 464 | 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding())) 465 | or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s') 466 | or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s') 467 | or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s') 468 | or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s') 469 | or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s') 470 | or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s') 471 | or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s') 472 | or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s') 473 | or u'%(id)s.%(ext)s'), 474 | 'ignoreerrors': opts.ignoreerrors, 475 | 'ratelimit': opts.ratelimit, 476 | 'nooverwrites': opts.nooverwrites, 477 | 'retries': opts.retries, 478 | 'continuedl': opts.continue_dl, 479 | 'noprogress': opts.noprogress, 480 | 'playliststart': opts.playliststart, 481 | 'playlistend': opts.playlistend, 482 | 'logtostderr': opts.outtmpl == '-', 483 | 'consoletitle': opts.consoletitle, 484 | 'nopart': opts.nopart, 485 | 'updatetime': opts.updatetime, 486 | 'writedescription': opts.writedescription, 487 | 'writeinfojson': opts.writeinfojson, 488 | 'writesubtitles': opts.writesubtitles, 489 | 'subtitleslang': opts.subtitleslang, 490 | 'matchtitle': opts.matchtitle, 491 | 'rejecttitle': opts.rejecttitle, 492 | 'max_downloads': opts.max_downloads, 493 | 'prefer_free_formats': opts.prefer_free_formats, 494 | 'verbose': opts.verbose, 495 | }) 496 | 497 | if opts.verbose: 498 | fd.to_screen(u'[debug] Proxy map: ' + str(proxy_handler.proxies)) 499 | 500 | for extractor in extractors: 501 | fd.add_info_extractor(extractor) 502 | 503 | # PostProcessors 504 | if opts.extractaudio: 505 | 
fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo)) 506 | 507 | # Update version 508 | if opts.update_self: 509 | updateSelf(fd, sys.argv[0]) 510 | 511 | # Maybe do nothing 512 | if len(all_urls) < 1: 513 | if not opts.update_self: 514 | parser.error(u'you must provide at least one URL') 515 | else: 516 | sys.exit() 517 | 518 | try: 519 | retcode = fd.download(all_urls) 520 | except MaxDownloadsReached: 521 | fd.to_screen(u'--max-download limit reached, aborting.') 522 | retcode = 101 523 | 524 | # Dump cookie jar if requested 525 | if opts.cookiefile is not None: 526 | try: 527 | jar.save() 528 | except (IOError, OSError), err: 529 | sys.exit(u'ERROR: unable to save cookie jar') 530 | 531 | sys.exit(retcode) 532 | 533 | def main(): 534 | try: 535 | _real_main() 536 | except DownloadError: 537 | sys.exit(1) 538 | except SameFileError: 539 | sys.exit(u'ERROR: fixed output name but more than one file to download') 540 | except KeyboardInterrupt: 541 | sys.exit(u'\nERROR: Interrupted by user') 542 | -------------------------------------------------------------------------------- /youtube_dl/FileDownloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import httplib 5 | import math 6 | import os 7 | import re 8 | import socket 9 | import subprocess 10 | import sys 11 | import time 12 | import urllib2 13 | 14 | if os.name == 'nt': 15 | import ctypes 16 | 17 | from utils import * 18 | 19 | 20 | class FileDownloader(object): 21 | """File Downloader class. 22 | 23 | File downloader objects are the ones responsible for downloading the 24 | actual video file and writing it to disk if the user has requested 25 | it, among some other tasks. In most cases there should be one per 26 | program. As, given a video URL, the downloader doesn't know how to 27 | extract all the needed information, a task that InfoExtractors do, it 28 | has to pass the URL to one of them. 29 | 30 | For this, file downloader objects have a method that allows 31 | InfoExtractors to be registered in a given order. When it is passed 32 | a URL, the file downloader hands it to the first InfoExtractor it 33 | finds that reports being able to handle it. The InfoExtractor extracts 34 | all the information about the video or videos the URL refers to, and 35 | asks the FileDownloader to process the video information, possibly 36 | downloading the video. 37 | 38 | File downloaders accept a lot of parameters. In order not to saturate 39 | the object constructor with arguments, it receives a dictionary of 40 | options instead. These options are available through the params 41 | attribute for the InfoExtractors to use. The FileDownloader also 42 | registers itself as the downloader in charge for the InfoExtractors 43 | that are added to it, so this is a "mutual registration". 44 | 45 | Available options: 46 | 47 | username: Username for authentication purposes. 48 | password: Password for authentication purposes. 49 | usenetrc: Use netrc for authentication instead. 50 | quiet: Do not print messages to stdout. 51 | forceurl: Force printing final URL. 52 | forcetitle: Force printing title. 53 | forcethumbnail: Force printing thumbnail URL. 54 | forcedescription: Force printing description. 55 | forcefilename: Force printing final filename. 56 | simulate: Do not download the video files. 57 | format: Video format code.
58 | format_limit: Highest quality format to try. 59 | outtmpl: Template for output names. 60 | ignoreerrors: Do not stop on download errors. 61 | ratelimit: Download speed limit, in bytes/sec. 62 | nooverwrites: Prevent overwriting files. 63 | retries: Number of times to retry for HTTP error 5xx 64 | continuedl: Try to continue downloads if possible. 65 | noprogress: Do not print the progress bar. 66 | playliststart: Playlist item to start at. 67 | playlistend: Playlist item to end at. 68 | matchtitle: Download only matching titles. 69 | rejecttitle: Reject downloads for matching titles. 70 | logtostderr: Log messages to stderr instead of stdout. 71 | consoletitle: Display progress in console window's titlebar. 72 | nopart: Do not use temporary .part files. 73 | updatetime: Use the Last-modified header to set output file timestamps. 74 | writedescription: Write the video description to a .description file 75 | writeinfojson: Write the video description to a .info.json file 76 | writesubtitles: Write the video subtitles to a .srt file 77 | subtitleslang: Language of the subtitles to download 78 | """ 79 | 80 | params = None 81 | _ies = [] 82 | _pps = [] 83 | _download_retcode = None 84 | _num_downloads = None 85 | _screen_file = None 86 | 87 | def __init__(self, params): 88 | """Create a FileDownloader object with the given options.""" 89 | self._ies = [] 90 | self._pps = [] 91 | self._download_retcode = 0 92 | self._num_downloads = 0 93 | self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] 94 | self.params = params 95 | 96 | @staticmethod 97 | def format_bytes(bytes): 98 | if bytes is None: 99 | return 'N/A' 100 | if type(bytes) is str: 101 | bytes = float(bytes) 102 | if bytes == 0.0: 103 | exponent = 0 104 | else: 105 | exponent = long(math.log(bytes, 1024.0)) 106 | suffix = 'bkMGTPEZY'[exponent] 107 | converted = float(bytes) / float(1024 ** exponent) 108 | return '%.2f%s' % (converted, suffix) 109 | 110 | @staticmethod 111 | def calc_percent(byte_counter, data_len): 112 | if data_len is None: 113 | return '---.-%' 114 | return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0)) 115 | 116 | @staticmethod 117 | def calc_eta(start, now, total, current): 118 | if total is None: 119 | return '--:--' 120 | dif = now - start 121 | if current == 0 or dif < 0.001: # One millisecond 122 | return '--:--' 123 | rate = float(current) / dif 124 | eta = long((float(total) - float(current)) / rate) 125 | (eta_mins, eta_secs) = divmod(eta, 60) 126 | if eta_mins > 99: 127 | return '--:--' 128 | return '%02d:%02d' % (eta_mins, eta_secs) 129 | 130 | @staticmethod 131 | def calc_speed(start, now, bytes): 132 | dif = now - start 133 | if bytes == 0 or dif < 0.001: # One millisecond 134 | return '%10s' % '---b/s' 135 | return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif)) 136 | 137 | @staticmethod 138 | def best_block_size(elapsed_time, bytes): 139 | new_min = max(bytes / 2.0, 1.0) 140 | new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB 141 | if elapsed_time < 0.001: 142 | return long(new_max) 143 | rate = bytes / elapsed_time 144 | if rate > new_max: 145 | return long(new_max) 146 | if rate < new_min: 147 | return long(new_min) 148 | return long(rate) 149 | 150 | @staticmethod 151 | def parse_bytes(bytestr): 152 | """Parse a string indicating a byte quantity into a long integer.""" 153 | matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) 154 | if matchobj is None: 155 | return None 156 | number = 
float(matchobj.group(1)) 157 | multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) 158 | return long(round(number * multiplier)) 159 | 160 | def add_info_extractor(self, ie): 161 | """Add an InfoExtractor object to the end of the list.""" 162 | self._ies.append(ie) 163 | ie.set_downloader(self) 164 | 165 | def add_post_processor(self, pp): 166 | """Add a PostProcessor object to the end of the chain.""" 167 | self._pps.append(pp) 168 | pp.set_downloader(self) 169 | 170 | def to_screen(self, message, skip_eol=False): 171 | """Print message to stdout if not in quiet mode.""" 172 | assert type(message) == type(u'') 173 | if not self.params.get('quiet', False): 174 | terminator = [u'\n', u''][skip_eol] 175 | output = message + terminator 176 | 177 | if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr 178 | output = output.encode(preferredencoding(), 'ignore') 179 | self._screen_file.write(output) 180 | self._screen_file.flush() 181 | 182 | def to_stderr(self, message): 183 | """Print message to stderr.""" 184 | print >>sys.stderr, message.encode(preferredencoding()) 185 | 186 | def to_cons_title(self, message): 187 | """Set console/terminal window title to message.""" 188 | if not self.params.get('consoletitle', False): 189 | return 190 | if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): 191 | # c_wchar_p() might not be necessary if `message` is 192 | # already of type unicode() 193 | ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) 194 | elif 'TERM' in os.environ: 195 | sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding())) 196 | 197 | def fixed_template(self): 198 | """Checks if the output template is fixed.""" 199 | return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None) 200 | 201 | def trouble(self, message=None): 202 | """Determine action to take when a download problem appears. 203 | 204 | Depending on if the downloader has been configured to ignore 205 | download errors or not, this method may throw an exception or 206 | not when errors are found, after printing the message. 
207 | """ 208 | if message is not None: 209 | self.to_stderr(message) 210 | if not self.params.get('ignoreerrors', False): 211 | raise DownloadError(message) 212 | self._download_retcode = 1 213 | 214 | def slow_down(self, start_time, byte_counter): 215 | """Sleep if the download speed is over the rate limit.""" 216 | rate_limit = self.params.get('ratelimit', None) 217 | if rate_limit is None or byte_counter == 0: 218 | return 219 | now = time.time() 220 | elapsed = now - start_time 221 | if elapsed <= 0.0: 222 | return 223 | speed = float(byte_counter) / elapsed 224 | if speed > rate_limit: 225 | time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit) 226 | 227 | def temp_name(self, filename): 228 | """Returns a temporary filename for the given filename.""" 229 | if self.params.get('nopart', False) or filename == u'-' or \ 230 | (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): 231 | return filename 232 | return filename + u'.part' 233 | 234 | def undo_temp_name(self, filename): 235 | if filename.endswith(u'.part'): 236 | return filename[:-len(u'.part')] 237 | return filename 238 | 239 | def try_rename(self, old_filename, new_filename): 240 | try: 241 | if old_filename == new_filename: 242 | return 243 | os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) 244 | except (IOError, OSError), err: 245 | self.trouble(u'ERROR: unable to rename file') 246 | 247 | def try_utime(self, filename, last_modified_hdr): 248 | """Try to set the last-modified time of the given file.""" 249 | if last_modified_hdr is None: 250 | return 251 | if not os.path.isfile(encodeFilename(filename)): 252 | return 253 | timestr = last_modified_hdr 254 | if timestr is None: 255 | return 256 | filetime = timeconvert(timestr) 257 | if filetime is None: 258 | return filetime 259 | try: 260 | os.utime(filename, (time.time(), filetime)) 261 | except: 262 | pass 263 | return filetime 264 | 265 | def report_writedescription(self, descfn): 266 | """ Report that the description file is being written """ 267 | self.to_screen(u'[info] Writing video description to: ' + descfn) 268 | 269 | def report_writesubtitles(self, srtfn): 270 | """ Report that the subtitles file is being written """ 271 | self.to_screen(u'[info] Writing video subtitles to: ' + srtfn) 272 | 273 | def report_writeinfojson(self, infofn): 274 | """ Report that the metadata file has been written """ 275 | self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) 276 | 277 | def report_destination(self, filename): 278 | """Report destination filename.""" 279 | self.to_screen(u'[download] Destination: ' + filename) 280 | 281 | def report_progress(self, percent_str, data_len_str, speed_str, eta_str): 282 | """Report download progress.""" 283 | if self.params.get('noprogress', False): 284 | return 285 | self.to_screen(u'\r[download] %s of %s at %s ETA %s' % 286 | (percent_str, data_len_str, speed_str, eta_str), skip_eol=True) 287 | self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' % 288 | (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())) 289 | 290 | def report_resuming_byte(self, resume_len): 291 | """Report attempt to resume at given byte.""" 292 | self.to_screen(u'[download] Resuming download at byte %s' % resume_len) 293 | 294 | def report_retry(self, count, retries): 295 | """Report retry in case of HTTP error 5xx""" 296 | self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' 
% (count, retries)) 297 | 298 | def report_file_already_downloaded(self, file_name): 299 | """Report file has already been fully downloaded.""" 300 | try: 301 | self.to_screen(u'[download] %s has already been downloaded' % file_name) 302 | except (UnicodeEncodeError), err: 303 | self.to_screen(u'[download] The file has already been downloaded') 304 | 305 | def report_unable_to_resume(self): 306 | """Report it was impossible to resume download.""" 307 | self.to_screen(u'[download] Unable to resume') 308 | 309 | def report_finish(self): 310 | """Report download finished.""" 311 | if self.params.get('noprogress', False): 312 | self.to_screen(u'[download] Download completed') 313 | else: 314 | self.to_screen(u'') 315 | 316 | def increment_downloads(self): 317 | """Increment the ordinal that assigns a number to each file.""" 318 | self._num_downloads += 1 319 | 320 | def prepare_filename(self, info_dict): 321 | """Generate the output filename.""" 322 | try: 323 | template_dict = dict(info_dict) 324 | template_dict['epoch'] = unicode(long(time.time())) 325 | template_dict['autonumber'] = unicode('%05d' % self._num_downloads) 326 | filename = self.params['outtmpl'] % template_dict 327 | return filename 328 | except (ValueError, KeyError), err: 329 | self.trouble(u'ERROR: invalid system charset or erroneous output template') 330 | return None 331 | 332 | def _match_entry(self, info_dict): 333 | """ Returns None iff the file should be downloaded """ 334 | 335 | title = info_dict['title'] 336 | matchtitle = self.params.get('matchtitle', False) 337 | if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): 338 | return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' 339 | rejecttitle = self.params.get('rejecttitle', False) 340 | if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): 341 | return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' 342 | return None 343 | 344 | def process_info(self, info_dict): 345 | """Process a single dictionary returned by an InfoExtractor.""" 346 | 347 | info_dict['stitle'] = sanitize_filename(info_dict['title']) 348 | 349 | reason = self._match_entry(info_dict) 350 | if reason is not None: 351 | self.to_screen(u'[download] ' + reason) 352 | return 353 | 354 | max_downloads = self.params.get('max_downloads') 355 | if max_downloads is not None: 356 | if self._num_downloads > int(max_downloads): 357 | raise MaxDownloadsReached() 358 | 359 | filename = self.prepare_filename(info_dict) 360 | 361 | # Forced printings 362 | if self.params.get('forcetitle', False): 363 | print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') 364 | if self.params.get('forceurl', False): 365 | print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace') 366 | if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: 367 | print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') 368 | if self.params.get('forcedescription', False) and 'description' in info_dict: 369 | print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace') 370 | if self.params.get('forcefilename', False) and filename is not None: 371 | print filename.encode(preferredencoding(), 'xmlcharrefreplace') 372 | if self.params.get('forceformat', False): 373 | print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace') 374 | 375 | # Do nothing else if in simulate mode 376 | if self.params.get('simulate', False): 377 | return 378 | 379 | if filename is None: 380 
| return 381 | 382 | try: 383 | dn = os.path.dirname(encodeFilename(filename)) 384 | if dn != '' and not os.path.exists(dn): # dn is already encoded 385 | os.makedirs(dn) 386 | except (OSError, IOError), err: 387 | self.trouble(u'ERROR: unable to create directory ' + unicode(err)) 388 | return 389 | 390 | if self.params.get('writedescription', False): 391 | try: 392 | descfn = filename + u'.description' 393 | self.report_writedescription(descfn) 394 | descfile = open(encodeFilename(descfn), 'wb') 395 | try: 396 | descfile.write(info_dict['description'].encode('utf-8')) 397 | finally: 398 | descfile.close() 399 | except (OSError, IOError): 400 | self.trouble(u'ERROR: Cannot write description file ' + descfn) 401 | return 402 | 403 | if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: 404 | # subtitles download errors are already managed as troubles in the relevant IE 405 | # that way it will silently go on when used with an IE that does not support subtitles 406 | try: 407 | srtfn = filename.rsplit('.', 1)[0] + u'.srt' 408 | self.report_writesubtitles(srtfn) 409 | srtfile = open(encodeFilename(srtfn), 'wb') 410 | try: 411 | srtfile.write(info_dict['subtitles'].encode('utf-8')) 412 | finally: 413 | srtfile.close() 414 | except (OSError, IOError): 415 | self.trouble(u'ERROR: Cannot write subtitles file ' + srtfn) 416 | return 417 | 418 | if self.params.get('writeinfojson', False): 419 | infofn = filename + u'.info.json' 420 | self.report_writeinfojson(infofn) 421 | try: 422 | json.dump 423 | except (NameError,AttributeError): 424 | self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.') 425 | return 426 | try: 427 | infof = open(encodeFilename(infofn), 'wb') 428 | try: 429 | json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',)) 430 | json.dump(json_info_dict, infof) 431 | finally: 432 | infof.close() 433 | except (OSError, IOError): 434 | self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn) 435 | return 436 | 437 | if not self.params.get('skip_download', False): 438 | if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)): 439 | success = True 440 | else: 441 | try: 442 | success = self._do_download(filename, info_dict) 443 | except (OSError, IOError), err: 444 | raise UnavailableVideoError 445 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 446 | self.trouble(u'ERROR: unable to download video data: %s' % str(err)) 447 | return 448 | except (ContentTooShortError, ), err: 449 | self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) 450 | return 451 | 452 | if success: 453 | try: 454 | self.post_process(filename, info_dict) 455 | except (PostProcessingError), err: 456 | self.trouble(u'ERROR: postprocessing: %s' % str(err)) 457 | return 458 | 459 | def download(self, url_list): 460 | """Download a given list of URLs.""" 461 | if len(url_list) > 1 and self.fixed_template(): 462 | raise SameFileError(self.params['outtmpl']) 463 | 464 | for url in url_list: 465 | suitable_found = False 466 | for ie in self._ies: 467 | # Go to next InfoExtractor if not suitable 468 | if not ie.suitable(url): 469 | continue 470 | 471 | # Suitable InfoExtractor found 472 | suitable_found = True 473 | 474 | # Extract information from URL and process it 475 | videos = ie.extract(url) 476 | for video in videos or []: 477 | try: 478 | self.increment_downloads() 479 | 
self.process_info(video) 480 | except UnavailableVideoError: 481 | self.trouble(u'\nERROR: unable to download video') 482 | 483 | # Suitable InfoExtractor had been found; go to next URL 484 | break 485 | 486 | if not suitable_found: 487 | self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url) 488 | 489 | return self._download_retcode 490 | 491 | def post_process(self, filename, ie_info): 492 | """Run the postprocessing chain on the given file.""" 493 | info = dict(ie_info) 494 | info['filepath'] = filename 495 | for pp in self._pps: 496 | info = pp.run(info) 497 | if info is None: 498 | break 499 | 500 | def _download_with_rtmpdump(self, filename, url, player_url): 501 | self.report_destination(filename) 502 | tmpfilename = self.temp_name(filename) 503 | 504 | # Check for rtmpdump first 505 | try: 506 | subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT) 507 | except (OSError, IOError): 508 | self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run') 509 | return False 510 | 511 | # Download using rtmpdump. rtmpdump returns exit code 2 when 512 | # the connection was interrupted and resuming appears to be 513 | # possible. This is part of rtmpdump's normal usage, AFAIK. 514 | basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename] 515 | args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)] 516 | if self.params.get('verbose', False): 517 | try: 518 | import pipes 519 | shell_quote = lambda args: ' '.join(map(pipes.quote, args)) 520 | except ImportError: 521 | shell_quote = repr 522 | self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) 523 | retval = subprocess.call(args) 524 | while retval == 2 or retval == 1: 525 | prevsize = os.path.getsize(encodeFilename(tmpfilename)) 526 | self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) 527 | time.sleep(5.0) # This seems to be needed 528 | retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) 529 | cursize = os.path.getsize(encodeFilename(tmpfilename)) 530 | if prevsize == cursize and retval == 1: 531 | break 532 | # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those 533 | if prevsize == cursize and retval == 2 and cursize > 1024: 534 | self.to_screen(u'\r[rtmpdump] Could not download the whole video. 
This can happen for some advertisements.') 535 | retval = 0 536 | break 537 | if retval == 0: 538 | self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(encodeFilename(tmpfilename))) 539 | self.try_rename(tmpfilename, filename) 540 | return True 541 | else: 542 | self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval) 543 | return False 544 | 545 | def _do_download(self, filename, info_dict): 546 | url = info_dict['url'] 547 | player_url = info_dict.get('player_url', None) 548 | 549 | # Check file already present 550 | if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False): 551 | self.report_file_already_downloaded(filename) 552 | return True 553 | 554 | # Attempt to download using rtmpdump 555 | if url.startswith('rtmp'): 556 | return self._download_with_rtmpdump(filename, url, player_url) 557 | 558 | tmpfilename = self.temp_name(filename) 559 | stream = None 560 | 561 | # Do not include the Accept-Encoding header 562 | headers = {'Youtubedl-no-compression': 'True'} 563 | basic_request = urllib2.Request(url, None, headers) 564 | request = urllib2.Request(url, None, headers) 565 | 566 | # Establish possible resume length 567 | if os.path.isfile(encodeFilename(tmpfilename)): 568 | resume_len = os.path.getsize(encodeFilename(tmpfilename)) 569 | else: 570 | resume_len = 0 571 | 572 | open_mode = 'wb' 573 | if resume_len != 0: 574 | if self.params.get('continuedl', False): 575 | self.report_resuming_byte(resume_len) 576 | request.add_header('Range','bytes=%d-' % resume_len) 577 | open_mode = 'ab' 578 | else: 579 | resume_len = 0 580 | 581 | count = 0 582 | retries = self.params.get('retries', 0) 583 | while count <= retries: 584 | # Establish connection 585 | try: 586 | if count == 0 and 'urlhandle' in info_dict: 587 | data = info_dict['urlhandle'] 588 | data = urllib2.urlopen(request) 589 | break 590 | except (urllib2.HTTPError, ), err: 591 | if (err.code < 500 or err.code >= 600) and err.code != 416: 592 | # Unexpected HTTP error 593 | raise 594 | elif err.code == 416: 595 | # Unable to resume (requested range not satisfiable) 596 | try: 597 | # Open the connection again without the range header 598 | data = urllib2.urlopen(basic_request) 599 | content_length = data.info()['Content-Length'] 600 | except (urllib2.HTTPError, ), err: 601 | if err.code < 500 or err.code >= 600: 602 | raise 603 | else: 604 | # Examine the reported length 605 | if (content_length is not None and 606 | (resume_len - 100 < long(content_length) < resume_len + 100)): 607 | # The file had already been fully downloaded. 608 | # Explanation to the above condition: in issue #175 it was revealed that 609 | # YouTube sometimes adds or removes a few bytes from the end of the file, 610 | # changing the file size slightly and causing problems for some users. So 611 | # I decided to implement a suggested change and consider the file 612 | # completely downloaded if the file size differs less than 100 bytes from 613 | # the one in the hard drive. 
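# To make the window above concrete, a sketch with hypothetical numbers
# (not from a real download): with resume_len = 4194304 bytes already in
# the .part file and a reported Content-Length of 4194301, the check is
# 4194204 < 4194301 < 4194404, so the file counts as fully downloaded and
# is only renamed below instead of being fetched again.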
614 | self.report_file_already_downloaded(filename) 615 | self.try_rename(tmpfilename, filename) 616 | return True 617 | else: 618 | # The length does not match, we start the download over 619 | self.report_unable_to_resume() 620 | open_mode = 'wb' 621 | break 622 | # Retry 623 | count += 1 624 | if count <= retries: 625 | self.report_retry(count, retries) 626 | 627 | if count > retries: 628 | self.trouble(u'ERROR: giving up after %s retries' % retries) 629 | return False 630 | 631 | data_len = data.info().get('Content-length', None) 632 | if data_len is not None: 633 | data_len = long(data_len) + resume_len 634 | data_len_str = self.format_bytes(data_len) 635 | byte_counter = 0 + resume_len 636 | block_size = 1024 637 | start = time.time() 638 | while True: 639 | # Download and write 640 | before = time.time() 641 | data_block = data.read(block_size) 642 | after = time.time() 643 | if len(data_block) == 0: 644 | break 645 | byte_counter += len(data_block) 646 | 647 | # Open file just in time 648 | if stream is None: 649 | try: 650 | (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) 651 | assert stream is not None 652 | filename = self.undo_temp_name(tmpfilename) 653 | self.report_destination(filename) 654 | except (OSError, IOError), err: 655 | self.trouble(u'ERROR: unable to open for writing: %s' % str(err)) 656 | return False 657 | try: 658 | stream.write(data_block) 659 | except (IOError, OSError), err: 660 | self.trouble(u'\nERROR: unable to write data: %s' % str(err)) 661 | return False 662 | block_size = self.best_block_size(after - before, len(data_block)) 663 | 664 | # Progress message 665 | speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) 666 | if data_len is None: 667 | self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') 668 | else: 669 | percent_str = self.calc_percent(byte_counter, data_len) 670 | eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) 671 | self.report_progress(percent_str, data_len_str, speed_str, eta_str) 672 | 673 | # Apply rate limit 674 | self.slow_down(start, byte_counter - resume_len) 675 | 676 | if stream is None: 677 | self.trouble(u'\nERROR: Did not get any data blocks') 678 | return False 679 | stream.close() 680 | self.report_finish() 681 | if data_len is not None and byte_counter != data_len: 682 | raise ContentTooShortError(byte_counter, long(data_len)) 683 | self.try_rename(tmpfilename, filename) 684 | 685 | # Update file modification time 686 | if self.params.get('updatetime', True): 687 | info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) 688 | 689 | return True 690 | -------------------------------------------------------------------------------- /youtube_dl/InfoExtractors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import datetime 5 | import HTMLParser 6 | import httplib 7 | import netrc 8 | import os 9 | import re 10 | import socket 11 | import time 12 | import urllib 13 | import urllib2 14 | import email.utils 15 | import xml.etree.ElementTree 16 | from urlparse import parse_qs 17 | 18 | try: 19 | import cStringIO as StringIO 20 | except ImportError: 21 | import StringIO 22 | 23 | from utils import * 24 | 25 | 26 | class InfoExtractor(object): 27 | """Information Extractor class. 
28 | 29 | Information extractors are the classes that, given a URL, extract 30 | information from the video (or videos) the URL refers to. This 31 | information includes the real video URL, the video title and simplified 32 | title, author and others. The information is stored in a dictionary 33 | which is then passed to the FileDownloader. The FileDownloader 34 | processes this information possibly downloading the video to the file 35 | system, among other possible outcomes. The dictionaries must include 36 | the following fields: 37 | 38 | id: Video identifier. 39 | url: Final video URL. 40 | uploader: Nickname of the video uploader. 41 | title: Literal title. 42 | ext: Video filename extension. 43 | format: Video format. 44 | player_url: SWF Player URL (may be None). 45 | 46 | The following fields are optional. Their primary purpose is to allow 47 | youtube-dl to serve as the backend for a video search function, such 48 | as the one in youtube2mp3. They are only used when their respective 49 | forced printing functions are called: 50 | 51 | thumbnail: Full URL to a video thumbnail image. 52 | description: One-line video description. 53 | 54 | Subclasses of this one should re-define the _real_initialize() and 55 | _real_extract() methods and define a _VALID_URL regexp. 56 | Probably, they should also be added to the list of extractors. 57 | """ 58 | 59 | _ready = False 60 | _downloader = None 61 | 62 | def __init__(self, downloader=None): 63 | """Constructor. Receives an optional downloader.""" 64 | self._ready = False 65 | self.set_downloader(downloader) 66 | 67 | def suitable(self, url): 68 | """Receives a URL and returns True if suitable for this IE.""" 69 | return re.match(self._VALID_URL, url) is not None 70 | 71 | def initialize(self): 72 | """Initializes an instance (authentication, etc).""" 73 | if not self._ready: 74 | self._real_initialize() 75 | self._ready = True 76 | 77 | def extract(self, url): 78 | """Extracts URL information and returns it in list of dicts.""" 79 | self.initialize() 80 | return self._real_extract(url) 81 | 82 | def set_downloader(self, downloader): 83 | """Sets the downloader for this IE.""" 84 | self._downloader = downloader 85 | 86 | def _real_initialize(self): 87 | """Real initialization process. Redefine in subclasses.""" 88 | pass 89 | 90 | def _real_extract(self, url): 91 | """Real extraction process. 
Redefine in subclasses.""" 92 | pass 93 | 94 | 95 | class YoutubeIE(InfoExtractor): 96 | """Information extractor for youtube.com.""" 97 | 98 | _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' 99 | _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' 100 | _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' 101 | _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' 102 | _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' 103 | _NETRC_MACHINE = 'youtube' 104 | # Listed in order of quality 105 | _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] 106 | _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] 107 | _video_extensions = { 108 | '13': '3gp', 109 | '17': 'mp4', 110 | '18': 'mp4', 111 | '22': 'mp4', 112 | '37': 'mp4', 113 | '38': 'video', # You actually don't know if this will be MOV, AVI or whatever 114 | '43': 'webm', 115 | '44': 'webm', 116 | '45': 'webm', 117 | '46': 'webm', 118 | } 119 | _video_dimensions = { 120 | '5': '240x400', 121 | '6': '???', 122 | '13': '???', 123 | '17': '144x176', 124 | '18': '360x640', 125 | '22': '720x1280', 126 | '34': '360x640', 127 | '35': '480x854', 128 | '37': '1080x1920', 129 | '38': '3072x4096', 130 | '43': '360x640', 131 | '44': '480x854', 132 | '45': '720x1280', 133 | '46': '1080x1920', 134 | } 135 | IE_NAME = u'youtube' 136 | 137 | def report_lang(self): 138 | """Report attempt to set language.""" 139 | self._downloader.to_screen(u'[youtube] Setting language') 140 | 141 | def report_login(self): 142 | """Report attempt to log in.""" 143 | self._downloader.to_screen(u'[youtube] Logging in') 144 | 145 | def report_age_confirmation(self): 146 | """Report attempt to confirm age.""" 147 | self._downloader.to_screen(u'[youtube] Confirming age') 148 | 149 | def report_video_webpage_download(self, video_id): 150 | """Report attempt to download video webpage.""" 151 | self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id) 152 | 153 | def report_video_info_webpage_download(self, video_id): 154 | """Report attempt to download video info webpage.""" 155 | self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) 156 | 157 | def report_video_subtitles_download(self, video_id): 158 | """Report attempt to download video subtitles.""" 159 | self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) 160 | 161 | def report_information_extraction(self, video_id): 162 | """Report attempt to extract video information.""" 163 | self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) 164 | 165 | def report_unavailable_format(self, video_id, format): 166 | """Report that the requested format is unavailable.""" 167 | self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format)) 168 | 169 | def report_rtmp_download(self): 170 | """Indicate the download will use the RTMP protocol.""" 171 | self._downloader.to_screen(u'[youtube] RTMP download detected') 172 | 173 | def _closed_captions_xml_to_srt(self, xml_string): 174 | srt = '' 175 | texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE) 176 | # TODO parse xml instead of regex 177 | for n, (start, dur_tag, dur, caption) in enumerate(texts):
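# A sketch of what this loop produces, using hypothetical caption data
# (not taken from a real video): the XML entry
#   <text start="9.75" dur="2.5">Hello world</text>
# is emitted as the SRT block
#   1
#   00:00:09,750 --> 00:00:12,250
#   Hello world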
178 | if not dur: dur = '4' 179 | start = float(start) 180 | end = start + float(dur) 181 | start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) 182 | end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) 183 | caption = unescapeHTML(caption) 184 | caption = unescapeHTML(caption) # double cycle, intentional 185 | srt += str(n+1) + '\n' 186 | srt += start + ' --> ' + end + '\n' 187 | srt += caption + '\n\n' 188 | return srt 189 | 190 | def _print_formats(self, formats): 191 | print 'Available formats:' 192 | for x in formats: 193 | print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')) 194 | 195 | def _real_initialize(self): 196 | if self._downloader is None: 197 | return 198 | 199 | username = None 200 | password = None 201 | downloader_params = self._downloader.params 202 | 203 | # Attempt to use provided username and password or .netrc data 204 | if downloader_params.get('username', None) is not None: 205 | username = downloader_params['username'] 206 | password = downloader_params['password'] 207 | elif downloader_params.get('usenetrc', False): 208 | try: 209 | info = netrc.netrc().authenticators(self._NETRC_MACHINE) 210 | if info is not None: 211 | username = info[0] 212 | password = info[2] 213 | else: 214 | raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) 215 | except (IOError, netrc.NetrcParseError), err: 216 | self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err)) 217 | return 218 | 219 | # Set language 220 | request = urllib2.Request(self._LANG_URL) 221 | try: 222 | self.report_lang() 223 | urllib2.urlopen(request).read() 224 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 225 | self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err)) 226 | return 227 | 228 | # No authentication to be performed 229 | if username is None: 230 | return 231 | 232 | # Log in 233 | login_form = { 234 | 'current_form': 'loginForm', 235 | 'next': '/', 236 | 'action_login': 'Log In', 237 | 'username': username, 238 | 'password': password, 239 | } 240 | request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form)) 241 | try: 242 | self.report_login() 243 | login_results = urllib2.urlopen(request).read() 244 | if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None: 245 | self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') 246 | return 247 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 248 | self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err)) 249 | return 250 | 251 | # Confirm age 252 | age_form = { 253 | 'next_url': '/', 254 | 'action_confirm': 'Confirm', 255 | } 256 | request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form)) 257 | try: 258 | self.report_age_confirmation() 259 | age_results = urllib2.urlopen(request).read() 260 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 261 | self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) 262 | return 263 | 264 | def _real_extract(self, url): 265 | # Extract original video URL from URL with redirection, like age verification, using next_url parameter 266 | mobj = re.search(self._NEXT_URL_RE, url) 267 | if mobj: 268 | url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/') 269 | 270 | # Extract video id from URL 271 | mobj = re.match(self._VALID_URL, url) 272 | if mobj is None: 273 | 
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 274 | return 275 | video_id = mobj.group(2) 276 | 277 | # Get video webpage 278 | self.report_video_webpage_download(video_id) 279 | request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) 280 | try: 281 | video_webpage = urllib2.urlopen(request).read() 282 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 283 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 284 | return 285 | 286 | # Attempt to extract SWF player URL 287 | mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) 288 | if mobj is not None: 289 | player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) 290 | else: 291 | player_url = None 292 | 293 | # Get video info 294 | self.report_video_info_webpage_download(video_id) 295 | for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: 296 | video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' 297 | % (video_id, el_type)) 298 | request = urllib2.Request(video_info_url) 299 | try: 300 | video_info_webpage = urllib2.urlopen(request).read() 301 | video_info = parse_qs(video_info_webpage) 302 | if 'token' in video_info: 303 | break 304 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 305 | self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err)) 306 | return 307 | if 'token' not in video_info: 308 | if 'reason' in video_info: 309 | self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8')) 310 | else: 311 | self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason') 312 | return 313 | 314 | # Check for "rental" videos 315 | if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: 316 | self._downloader.trouble(u'ERROR: "rental" videos not supported') 317 | return 318 | 319 | # Start extracting information 320 | self.report_information_extraction(video_id) 321 | 322 | # uploader 323 | if 'author' not in video_info: 324 | self._downloader.trouble(u'ERROR: unable to extract uploader nickname') 325 | return 326 | video_uploader = urllib.unquote_plus(video_info['author'][0]) 327 | 328 | # title 329 | if 'title' not in video_info: 330 | self._downloader.trouble(u'ERROR: unable to extract video title') 331 | return 332 | video_title = urllib.unquote_plus(video_info['title'][0]) 333 | video_title = video_title.decode('utf-8') 334 | 335 | # thumbnail image 336 | if 'thumbnail_url' not in video_info: 337 | self._downloader.trouble(u'WARNING: unable to extract video thumbnail') 338 | video_thumbnail = '' 339 | else: # don't panic if we can't find it 340 | video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0]) 341 | 342 | # upload date 343 | upload_date = u'NA' 344 | mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL) 345 | if mobj is not None: 346 | upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) 347 | format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y'] 348 | for expression in format_expressions: 349 | try: 350 | upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d') 351 | except: 352 | pass 353 | 354 | # description 355 | video_description = get_element_by_id("eow-description", video_webpage.decode('utf8')) 356 | if video_description: video_description = clean_html(video_description) 357 | else: video_description = '' 358 | 
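# The closed-caption code below first asks Google's timedtext service for
# the list of available subtitle tracks; an illustrative (not captured)
# listing entry looks like
#   <track id="0" name="" lang_code="en" lang_original="English" />
# and the lang_code attributes are what gets collected into srt_lang_list.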
359 | # closed captions 360 | video_subtitles = None 361 | if self._downloader.params.get('writesubtitles', False): 362 | try: 363 | self.report_video_subtitles_download(video_id) 364 | request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) 365 | try: 366 | srt_list = urllib2.urlopen(request).read() 367 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 368 | raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) 369 | srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) 370 | if not srt_lang_list: 371 | raise Trouble(u'WARNING: video has no closed captions') 372 | if self._downloader.params.get('subtitleslang', False): 373 | srt_lang = self._downloader.params.get('subtitleslang') 374 | elif 'en' in srt_lang_list: 375 | srt_lang = 'en' 376 | else: 377 | srt_lang = srt_lang_list[0] 378 | if not srt_lang in srt_lang_list: 379 | raise Trouble(u'WARNING: no closed captions found in the specified language') 380 | request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) 381 | try: 382 | srt_xml = urllib2.urlopen(request).read() 383 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 384 | raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) 385 | video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) 386 | except Trouble as trouble: 387 | self._downloader.trouble(trouble[0]) 388 | 389 | # token 390 | video_token = urllib.unquote_plus(video_info['token'][0]) 391 | 392 | # Decide which formats to download 393 | req_format = self._downloader.params.get('format', None) 394 | 395 | if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): 396 | self.report_rtmp_download() 397 | video_url_list = [(None, video_info['conn'][0])] 398 | elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: 399 | url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') 400 | url_data = [parse_qs(uds) for uds in url_data_strs] 401 | url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data) 402 | url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data) 403 | 404 | format_limit = self._downloader.params.get('format_limit', None) 405 | available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats 406 | if format_limit is not None and format_limit in available_formats: 407 | format_list = available_formats[available_formats.index(format_limit):] 408 | else: 409 | format_list = available_formats 410 | existing_formats = [x for x in format_list if x in url_map] 411 | if len(existing_formats) == 0: 412 | self._downloader.trouble(u'ERROR: no known formats available for video') 413 | return 414 | if self._downloader.params.get('listformats', None): 415 | self._print_formats(existing_formats) 416 | return 417 | if req_format is None or req_format == 'best': 418 | video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality 419 | elif req_format == 'worst': 420 | video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality 421 | elif req_format in ('-1', 'all'): 422 | video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats 423 | else: 424 | # Specific formats. We pick the first in a slash-delimeted sequence. 
425 | # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. 426 | req_formats = req_format.split('/') 427 | video_url_list = None 428 | for rf in req_formats: 429 | if rf in url_map: 430 | video_url_list = [(rf, url_map[rf])] 431 | break 432 | if video_url_list is None: 433 | self._downloader.trouble(u'ERROR: requested format not available') 434 | return 435 | else: 436 | self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') 437 | return 438 | 439 | results = [] 440 | for format_param, video_real_url in video_url_list: 441 | # Extension 442 | video_extension = self._video_extensions.get(format_param, 'flv') 443 | 444 | results.append({ 445 | 'id': video_id.decode('utf-8'), 446 | 'url': video_real_url.decode('utf-8'), 447 | 'uploader': video_uploader.decode('utf-8'), 448 | 'upload_date': upload_date, 449 | 'title': video_title, 450 | 'ext': video_extension.decode('utf-8'), 451 | 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 452 | 'thumbnail': video_thumbnail.decode('utf-8'), 453 | 'description': video_description, 454 | 'player_url': player_url, 455 | 'subtitles': video_subtitles 456 | }) 457 | return results 458 | 459 | 460 | class MetacafeIE(InfoExtractor): 461 | """Information Extractor for metacafe.com.""" 462 | 463 | _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' 464 | _DISCLAIMER = 'http://www.metacafe.com/family_filter/' 465 | _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' 466 | IE_NAME = u'metacafe' 467 | 468 | def __init__(self, downloader=None): 469 | InfoExtractor.__init__(self, downloader) 470 | 471 | def report_disclaimer(self): 472 | """Report disclaimer retrieval.""" 473 | self._downloader.to_screen(u'[metacafe] Retrieving disclaimer') 474 | 475 | def report_age_confirmation(self): 476 | """Report attempt to confirm age.""" 477 | self._downloader.to_screen(u'[metacafe] Confirming age') 478 | 479 | def report_download_webpage(self, video_id): 480 | """Report webpage download.""" 481 | self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id) 482 | 483 | def report_extraction(self, video_id): 484 | """Report information extraction.""" 485 | self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id) 486 | 487 | def _real_initialize(self): 488 | # Retrieve disclaimer 489 | request = urllib2.Request(self._DISCLAIMER) 490 | try: 491 | self.report_disclaimer() 492 | disclaimer = urllib2.urlopen(request).read() 493 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 494 | self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err)) 495 | return 496 | 497 | # Confirm age 498 | disclaimer_form = { 499 | 'filters': '0', 500 | 'submit': "Continue - I'm over 18", 501 | } 502 | request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form)) 503 | try: 504 | self.report_age_confirmation() 505 | disclaimer = urllib2.urlopen(request).read() 506 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 507 | self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) 508 | return 509 | 510 | def _real_extract(self, url): 511 | # Extract id and simplified title from URL 512 | mobj = re.match(self._VALID_URL, url) 513 | if mobj is None: 514 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 515 | return 516 | 517 | video_id = mobj.group(1) 518 | 519 | # Check if video comes from 
YouTube 520 | mobj2 = re.match(r'^yt-(.*)$', video_id) 521 | if mobj2 is not None: 522 | self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) 523 | return 524 | 525 | # Retrieve video webpage to extract further information 526 | request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id) 527 | try: 528 | self.report_download_webpage(video_id) 529 | webpage = urllib2.urlopen(request).read() 530 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 531 | self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err)) 532 | return 533 | 534 | # Extract URL, uploader and title from webpage 535 | self.report_extraction(video_id) 536 | mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) 537 | if mobj is not None: 538 | mediaURL = urllib.unquote(mobj.group(1)) 539 | video_extension = mediaURL[-3:] 540 | 541 | # Extract gdaKey if available 542 | mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) 543 | if mobj is None: 544 | video_url = mediaURL 545 | else: 546 | gdaKey = mobj.group(1) 547 | video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) 548 | else: 549 | mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) 550 | if mobj is None: 551 | self._downloader.trouble(u'ERROR: unable to extract media URL') 552 | return 553 | vardict = parse_qs(mobj.group(1)) 554 | if 'mediaData' not in vardict: 555 | self._downloader.trouble(u'ERROR: unable to extract media URL') 556 | return 557 | mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0]) 558 | if mobj is None: 559 | self._downloader.trouble(u'ERROR: unable to extract media URL') 560 | return 561 | mediaURL = mobj.group(1).replace('\\/', '/') 562 | video_extension = mediaURL[-3:] 563 | video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2)) 564 | 565 | mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage) 566 | if mobj is None: 567 | self._downloader.trouble(u'ERROR: unable to extract title') 568 | return 569 | video_title = mobj.group(1).decode('utf-8') 570 | 571 | mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage) 572 | if mobj is None: 573 | self._downloader.trouble(u'ERROR: unable to extract uploader nickname') 574 | return 575 | video_uploader = mobj.group(1) 576 | 577 | return [{ 578 | 'id': video_id.decode('utf-8'), 579 | 'url': video_url.decode('utf-8'), 580 | 'uploader': video_uploader.decode('utf-8'), 581 | 'upload_date': u'NA', 582 | 'title': video_title, 583 | 'ext': video_extension.decode('utf-8'), 584 | 'format': u'NA', 585 | 'player_url': None, 586 | }] 587 | 588 | 589 | class DailymotionIE(InfoExtractor): 590 | """Information Extractor for Dailymotion""" 591 | 592 | _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' 593 | IE_NAME = u'dailymotion' 594 | 595 | def __init__(self, downloader=None): 596 | InfoExtractor.__init__(self, downloader) 597 | 598 | def report_download_webpage(self, video_id): 599 | """Report webpage download.""" 600 | self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id) 601 | 602 | def report_extraction(self, video_id): 603 | """Report information extraction.""" 604 | self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) 605 | 606 | def _real_extract(self, url): 607 | # Extract id and simplified title from URL 608 | mobj = re.match(self._VALID_URL, url) 609 | if mobj is None: 610 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 611 | return 612 | 613 | video_id = mobj.group(1) 614 | 615 | video_extension = 'flv'
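# For reference, a URL matching _VALID_URL above looks like (hypothetical
# id and slug):
#   http://www.dailymotion.com/video/x7u5kq_some-title_news
# where group(1), the id, is the path segment before the first underscore.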
616 | 617 | # Retrieve video webpage to extract further information 618 | request = urllib2.Request(url) 619 | request.add_header('Cookie', 'family_filter=off') 620 | try: 621 | self.report_download_webpage(video_id) 622 | webpage = urllib2.urlopen(request).read() 623 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 624 | self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err)) 625 | return 626 | 627 | # Extract URL, uploader and title from webpage 628 | self.report_extraction(video_id) 629 | mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage) 630 | if mobj is None: 631 | self._downloader.trouble(u'ERROR: unable to extract media URL') 632 | return 633 | sequence = urllib.unquote(mobj.group(1)) 634 | mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence) 635 | if mobj is None: 636 | self._downloader.trouble(u'ERROR: unable to extract media URL') 637 | return 638 | mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '') 639 | 640 | # if needed add http://www.dailymotion.com/ if relative URL 641 | 642 | video_url = mediaURL 643 | 644 | mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage) 645 | if mobj is None: 646 | self._downloader.trouble(u'ERROR: unable to extract title') 647 | return 648 | video_title = unescapeHTML(mobj.group('title').decode('utf-8')) 649 | 650 | mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage) 651 | if mobj is None: 652 | self._downloader.trouble(u'ERROR: unable to extract uploader nickname') 653 | return 654 | video_uploader = mobj.group(1) 655 | 656 | return [{ 657 | 'id': video_id.decode('utf-8'), 658 | 'url': video_url.decode('utf-8'), 659 | 'uploader': video_uploader.decode('utf-8'), 660 | 'upload_date': u'NA', 661 | 'title': video_title, 662 | 'ext': video_extension.decode('utf-8'), 663 | 'format': u'NA', 664 | 'player_url': None, 665 | }] 666 | 667 | 668 | class GoogleIE(InfoExtractor): 669 | """Information extractor for video.google.com.""" 670 | 671 | _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*' 672 | IE_NAME = u'video.google' 673 | 674 | def __init__(self, downloader=None): 675 | InfoExtractor.__init__(self, downloader) 676 | 677 | def report_download_webpage(self, video_id): 678 | """Report webpage download.""" 679 | self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id) 680 | 681 | def report_extraction(self, video_id): 682 | """Report information extraction.""" 683 | self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id) 684 | 685 | def _real_extract(self, url): 686 | # Extract id from URL 687 | mobj = re.match(self._VALID_URL, url) 688 | if mobj is None: 689 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 690 | return 691 | 692 | video_id = mobj.group(1) 693 | 694 | video_extension = 'mp4' 695 | 696 | # Retrieve video webpage to extract further information 697 | request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id) 698 | try: 699 | self.report_download_webpage(video_id) 700 | webpage = urllib2.urlopen(request).read() 701 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 702 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 703 | return 704 | 705 | # Extract URL, uploader, and title from webpage 706 | self.report_extraction(video_id) 707 | mobj = re.search(r"download_url:'([^']+)'", webpage) 708 | if mobj is None: 709 | video_extension = 'flv'
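# Fallback for pages that embed the URL JavaScript-escaped; an illustrative
# fragment (not a captured page) looks like
#   videoUrl\x3dhttp%3A%2F%2Fvideo.google.com%2F...\x26
# so the \x3d and \x26 escapes are decoded back to '=' and '&' below.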
710 | mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage) 711 | if mobj is None: 712 | self._downloader.trouble(u'ERROR: unable to extract media URL') 713 | return 714 | mediaURL = urllib.unquote(mobj.group(1)) 715 | mediaURL = mediaURL.replace('\\x3d', '\x3d') 716 | mediaURL = mediaURL.replace('\\x26', '\x26') 717 | 718 | video_url = mediaURL 719 | 720 | mobj = re.search(r'<title>(.*)</title>', webpage) 721 | if mobj is None: 722 | self._downloader.trouble(u'ERROR: unable to extract title') 723 | return 724 | video_title = mobj.group(1).decode('utf-8') 725 | 726 | # Extract video description 727 | mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage) 728 | if mobj is None: 729 | self._downloader.trouble(u'ERROR: unable to extract video description') 730 | return 731 | video_description = mobj.group(1).decode('utf-8') 732 | if not video_description: 733 | video_description = 'No description available.' 734 | 735 | # Extract video thumbnail 736 | if self._downloader.params.get('forcethumbnail', False): 737 | request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id))) 738 | try: 739 | webpage = urllib2.urlopen(request).read() 740 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 741 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 742 | return 743 | mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage) 744 | if mobj is None: 745 | self._downloader.trouble(u'ERROR: unable to extract video thumbnail') 746 | return 747 | video_thumbnail = mobj.group(1) 748 | else: # we need something to pass to process_info 749 | video_thumbnail = '' 750 | 751 | return [{ 752 | 'id': video_id.decode('utf-8'), 753 | 'url': video_url.decode('utf-8'), 754 | 'uploader': u'NA', 755 | 'upload_date': u'NA', 756 | 'title': video_title, 757 | 'ext': video_extension.decode('utf-8'), 758 | 'format': u'NA', 759 | 'player_url': None, 760 | }] 761 | 762 | 763 | class PhotobucketIE(InfoExtractor): 764 | """Information extractor for photobucket.com.""" 765 | 766 | _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' 767 | IE_NAME = u'photobucket' 768 | 769 | def __init__(self, downloader=None): 770 | InfoExtractor.__init__(self, downloader) 771 | 772 | def report_download_webpage(self, video_id): 773 | """Report webpage download.""" 774 | self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id) 775 | 776 | def report_extraction(self, video_id): 777 | """Report information extraction.""" 778 | self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id) 779 | 780 | def _real_extract(self, url): 781 | # Extract id from URL 782 | mobj = re.match(self._VALID_URL, url) 783 | if mobj is None: 784 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 785 | return 786 | 787 | video_id = mobj.group(1) 788 | 789 | video_extension = 'flv' 790 | 791 | # Retrieve video webpage to extract further information 792 | request = urllib2.Request(url) 793 | try: 794 | self.report_download_webpage(video_id) 795 | webpage = urllib2.urlopen(request).read() 796 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 797 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 798 | return 799 | 800 | # Extract URL, uploader, and title from webpage 801 | self.report_extraction(video_id) 802 | mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage) 803 | if mobj is None: 804 | self._downloader.trouble(u'ERROR: unable to extract media URL') 805 | return 806 | mediaURL = 
urllib.unquote(mobj.group(1)) 807 | 808 | video_url = mediaURL 809 | 810 | mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) 811 | if mobj is None: 812 | self._downloader.trouble(u'ERROR: unable to extract title') 813 | return 814 | video_title = mobj.group(1).decode('utf-8') 815 | 816 | video_uploader = mobj.group(2).decode('utf-8') 817 | 818 | return [{ 819 | 'id': video_id.decode('utf-8'), 820 | 'url': video_url.decode('utf-8'), 821 | 'uploader': video_uploader, 822 | 'upload_date': u'NA', 823 | 'title': video_title, 824 | 'ext': video_extension.decode('utf-8'), 825 | 'format': u'NA', 826 | 'player_url': None, 827 | }] 828 | 829 | 830 | class YahooIE(InfoExtractor): 831 | """Information extractor for video.yahoo.com.""" 832 | 833 | # _VALID_URL matches all Yahoo! Video URLs 834 | # _VPAGE_URL matches only the extractable '/watch/' URLs 835 | _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' 836 | _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' 837 | IE_NAME = u'video.yahoo' 838 | 839 | def __init__(self, downloader=None): 840 | InfoExtractor.__init__(self, downloader) 841 | 842 | def report_download_webpage(self, video_id): 843 | """Report webpage download.""" 844 | self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id) 845 | 846 | def report_extraction(self, video_id): 847 | """Report information extraction.""" 848 | self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id) 849 | 850 | def _real_extract(self, url, new_video=True): 851 | # Extract ID from URL 852 | mobj = re.match(self._VALID_URL, url) 853 | if mobj is None: 854 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 855 | return 856 | 857 | video_id = mobj.group(2) 858 | video_extension = 'flv' 859 | 860 | # Rewrite valid but non-extractable URLs as 861 | # extractable English language /watch/ URLs 862 | if re.match(self._VPAGE_URL, url) is None: 863 | request = urllib2.Request(url) 864 | try: 865 | webpage = urllib2.urlopen(request).read() 866 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 867 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 868 | return 869 | 870 | mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) 871 | if mobj is None: 872 | self._downloader.trouble(u'ERROR: Unable to extract id field') 873 | return 874 | yahoo_id = mobj.group(1) 875 | 876 | mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage) 877 | if mobj is None: 878 | self._downloader.trouble(u'ERROR: Unable to extract vid field') 879 | return 880 | yahoo_vid = mobj.group(1) 881 | 882 | url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id) 883 | return self._real_extract(url, new_video=False) 884 | 885 | # Retrieve video webpage to extract further information 886 | request = urllib2.Request(url) 887 | try: 888 | self.report_download_webpage(video_id) 889 | webpage = urllib2.urlopen(request).read() 890 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 891 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 892 | return 893 | 894 | # Extract uploader and title from webpage 895 | self.report_extraction(video_id) 896 | mobj = re.search(r'', webpage) 897 | if mobj is None: 898 | self._downloader.trouble(u'ERROR: unable to extract video title') 899 | return 900 | video_title = mobj.group(1).decode('utf-8') 901 | 902 | mobj = re.search(r'
<h2 class="ti-5"><a href="http://video\.yahoo\.com/(?:people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>
', webpage) 903 | if mobj is None: 904 | self._downloader.trouble(u'ERROR: unable to extract video uploader') 905 | return 906 | video_uploader = mobj.group(1).decode('utf-8') 907 | 908 | # Extract video thumbnail 909 | mobj = re.search(r'', webpage) 910 | if mobj is None: 911 | self._downloader.trouble(u'ERROR: unable to extract video thumbnail') 912 | return 913 | video_thumbnail = mobj.group(1).decode('utf-8') 914 | 915 | # Extract video description 916 | mobj = re.search(r'', webpage) 917 | if mobj is None: 918 | self._downloader.trouble(u'ERROR: unable to extract video description') 919 | return 920 | video_description = mobj.group(1).decode('utf-8') 921 | if not video_description: 922 | video_description = 'No description available.' 923 | 924 | # Extract video height and width 925 | mobj = re.search(r'', webpage) 926 | if mobj is None: 927 | self._downloader.trouble(u'ERROR: unable to extract video height') 928 | return 929 | yv_video_height = mobj.group(1) 930 | 931 | mobj = re.search(r'', webpage) 932 | if mobj is None: 933 | self._downloader.trouble(u'ERROR: unable to extract video width') 934 | return 935 | yv_video_width = mobj.group(1) 936 | 937 | # Retrieve video playlist to extract media URL 938 | # I'm not completely sure what all these options are, but we 939 | # seem to need most of them, otherwise the server sends a 401. 940 | yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents 941 | yv_bitrate = '700' # according to Wikipedia this is hard-coded 942 | request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id + 943 | '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height + 944 | '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') 945 | try: 946 | self.report_download_webpage(video_id) 947 | webpage = urllib2.urlopen(request).read() 948 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 949 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 950 | return 951 | 952 | # Extract media URL from playlist XML 953 | mobj = re.search(r'[^:]*: (.*?)( \([^\(]*\))?', webpage) 1040 | if mobj is not None: 1041 | video_upload_date = mobj.group(1) 1042 | 1043 | # Vimeo specific: extract request signature and timestamp 1044 | sig = config['request']['signature'] 1045 | timestamp = config['request']['timestamp'] 1046 | 1047 | # Vimeo specific: extract video codec and quality information 1048 | # TODO bind to format param 1049 | codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] 1050 | for codec in codecs: 1051 | if codec[0] in config["video"]["files"]: 1052 | video_codec = codec[0] 1053 | video_extension = codec[1] 1054 | if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd' 1055 | else: quality = 'sd' 1056 | break 1057 | else: 1058 | self._downloader.trouble(u'ERROR: no known codec found') 1059 | return 1060 | 1061 | video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ 1062 | %(video_id, sig, timestamp, quality, video_codec.upper()) 1063 | 1064 | return [{ 1065 | 'id': video_id, 1066 | 'url': video_url, 1067 | 'uploader': video_uploader, 1068 | 'upload_date': video_upload_date, 1069 | 'title': video_title, 1070 | 'ext': video_extension, 1071 | 'thumbnail': video_thumbnail, 1072 | 'description': video_description, 1073 | 'player_url': None, 1074 | }] 1075 | 1076 | 1077 | class 
GenericIE(InfoExtractor): 1078 | """Generic last-resort information extractor.""" 1079 | 1080 | _VALID_URL = r'.*' 1081 | IE_NAME = u'generic' 1082 | 1083 | def __init__(self, downloader=None): 1084 | InfoExtractor.__init__(self, downloader) 1085 | 1086 | def report_download_webpage(self, video_id): 1087 | """Report webpage download.""" 1088 | self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') 1089 | self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id) 1090 | 1091 | def report_extraction(self, video_id): 1092 | """Report information extraction.""" 1093 | self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id) 1094 | 1095 | def report_following_redirect(self, new_url): 1096 | """Report information extraction.""" 1097 | self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) 1098 | 1099 | def _test_redirect(self, url): 1100 | """Check if it is a redirect, like url shorteners, in case restart chain.""" 1101 | class HeadRequest(urllib2.Request): 1102 | def get_method(self): 1103 | return "HEAD" 1104 | 1105 | class HEADRedirectHandler(urllib2.HTTPRedirectHandler): 1106 | """ 1107 | Subclass the HTTPRedirectHandler to make it use our 1108 | HeadRequest also on the redirected URL 1109 | """ 1110 | def redirect_request(self, req, fp, code, msg, headers, newurl): 1111 | if code in (301, 302, 303, 307): 1112 | newurl = newurl.replace(' ', '%20') 1113 | newheaders = dict((k,v) for k,v in req.headers.items() 1114 | if k.lower() not in ("content-length", "content-type")) 1115 | return HeadRequest(newurl, 1116 | headers=newheaders, 1117 | origin_req_host=req.get_origin_req_host(), 1118 | unverifiable=True) 1119 | else: 1120 | raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 1121 | 1122 | class HTTPMethodFallback(urllib2.BaseHandler): 1123 | """ 1124 | Fallback to GET if HEAD is not allowed (405 HTTP error) 1125 | """ 1126 | def http_error_405(self, req, fp, code, msg, headers): 1127 | fp.read() 1128 | fp.close() 1129 | 1130 | newheaders = dict((k,v) for k,v in req.headers.items() 1131 | if k.lower() not in ("content-length", "content-type")) 1132 | return self.parent.open(urllib2.Request(req.get_full_url(), 1133 | headers=newheaders, 1134 | origin_req_host=req.get_origin_req_host(), 1135 | unverifiable=True)) 1136 | 1137 | # Build our opener 1138 | opener = urllib2.OpenerDirector() 1139 | for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler, 1140 | HTTPMethodFallback, HEADRedirectHandler, 1141 | urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]: 1142 | opener.add_handler(handler()) 1143 | 1144 | response = opener.open(HeadRequest(url)) 1145 | new_url = response.geturl() 1146 | 1147 | if url == new_url: return False 1148 | 1149 | self.report_following_redirect(new_url) 1150 | self._downloader.download([new_url]) 1151 | return True 1152 | 1153 | def _real_extract(self, url): 1154 | if self._test_redirect(url): return 1155 | 1156 | video_id = url.split('/')[-1] 1157 | request = urllib2.Request(url) 1158 | try: 1159 | self.report_download_webpage(video_id) 1160 | webpage = urllib2.urlopen(request).read() 1161 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1162 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 1163 | return 1164 | except ValueError, err: 1165 | # since this is the last-resort InfoExtractor, if 1166 | # this error is thrown, it'll be thrown here 1167 | self._downloader.trouble(u'ERROR: Invalid URL: 
%s' % url) 1168 | return 1169 | 1170 | self.report_extraction(video_id) 1171 | # Start with something easy: JW Player in SWFObject 1172 | mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) 1173 | if mobj is None: 1174 | # Broaden the search a little bit 1175 | mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) 1176 | if mobj is None: 1177 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 1178 | return 1179 | 1180 | # It's possible that one of the regexes 1181 | # matched, but returned an empty group: 1182 | if mobj.group(1) is None: 1183 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 1184 | return 1185 | 1186 | video_url = urllib.unquote(mobj.group(1)) 1187 | video_id = os.path.basename(video_url) 1188 | 1189 | # here's a fun little line of code for you: 1190 | video_extension = os.path.splitext(video_id)[1][1:] 1191 | video_id = os.path.splitext(video_id)[0] 1192 | 1193 | # it's tempting to parse this further, but you would 1194 | # have to take into account all the variations like 1195 | # Video Title - Site Name 1196 | # Site Name | Video Title 1197 | # Video Title - Tagline | Site Name 1198 | # and so on and so forth; it's just not practical 1199 | mobj = re.search(r'(.*)', webpage) 1200 | if mobj is None: 1201 | self._downloader.trouble(u'ERROR: unable to extract title') 1202 | return 1203 | video_title = mobj.group(1).decode('utf-8') 1204 | 1205 | # video uploader is domain name 1206 | mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) 1207 | if mobj is None: 1208 | self._downloader.trouble(u'ERROR: unable to extract title') 1209 | return 1210 | video_uploader = mobj.group(1).decode('utf-8') 1211 | 1212 | return [{ 1213 | 'id': video_id.decode('utf-8'), 1214 | 'url': video_url.decode('utf-8'), 1215 | 'uploader': video_uploader, 1216 | 'upload_date': u'NA', 1217 | 'title': video_title, 1218 | 'ext': video_extension.decode('utf-8'), 1219 | 'format': u'NA', 1220 | 'player_url': None, 1221 | }] 1222 | 1223 | 1224 | class YoutubeSearchIE(InfoExtractor): 1225 | """Information Extractor for YouTube search queries.""" 1226 | _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' 1227 | _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' 1228 | _max_youtube_results = 1000 1229 | IE_NAME = u'youtube:search' 1230 | 1231 | def __init__(self, downloader=None): 1232 | InfoExtractor.__init__(self, downloader) 1233 | 1234 | def report_download_page(self, query, pagenum): 1235 | """Report attempt to download search page with given number.""" 1236 | query = query.decode(preferredencoding()) 1237 | self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) 1238 | 1239 | def _real_extract(self, query): 1240 | mobj = re.match(self._VALID_URL, query) 1241 | if mobj is None: 1242 | self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 1243 | return 1244 | 1245 | prefix, query = query.split(':') 1246 | prefix = prefix[8:] 1247 | query = query.encode('utf-8') 1248 | if prefix == '': 1249 | self._download_n_results(query, 1) 1250 | return 1251 | elif prefix == 'all': 1252 | self._download_n_results(query, self._max_youtube_results) 1253 | return 1254 | else: 1255 | try: 1256 | n = long(prefix) 1257 | if n <= 0: 1258 | self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 1259 | return 1260 | elif n > self._max_youtube_results: 1261 | self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested 
%i)' % (self._max_youtube_results, n))
1262 | 					n = self._max_youtube_results
1263 | 				self._download_n_results(query, n)
1264 | 				return
1265 | 			except ValueError: # parsing prefix as integer fails
1266 | 				self._download_n_results(query, 1)
1267 | 				return
1268 | 
1269 | 	def _download_n_results(self, query, n):
1270 | 		"""Downloads a specified number of results for a query"""
1271 | 
1272 | 		video_ids = []
1273 | 		pagenum = 0
1274 | 		limit = n
1275 | 
1276 | 		while (50 * pagenum) < limit:
1277 | 			self.report_download_page(query, pagenum+1)
1278 | 			result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1279 | 			request = urllib2.Request(result_url)
1280 | 			try:
1281 | 				data = urllib2.urlopen(request).read()
1282 | 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283 | 				self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1284 | 				return
1285 | 			api_response = json.loads(data)['data']
1286 | 
1287 | 			new_ids = list(video['id'] for video in api_response['items'])
1288 | 			video_ids += new_ids
1289 | 
1290 | 			limit = min(n, api_response['totalItems'])
1291 | 			pagenum += 1
1292 | 
1293 | 		if len(video_ids) > n:
1294 | 			video_ids = video_ids[:n]
1295 | 		for id in video_ids:
1296 | 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1297 | 		return
1298 | 
1299 | 
1300 | class GoogleSearchIE(InfoExtractor):
1301 | 	"""Information Extractor for Google Video search queries."""
1302 | 	_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1303 | 	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1304 | 	_VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1305 | 	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
1306 | 	_max_google_results = 1000
1307 | 	IE_NAME = u'video.google:search'
1308 | 
1309 | 	def __init__(self, downloader=None):
1310 | 		InfoExtractor.__init__(self, downloader)
1311 | 
1312 | 	def report_download_page(self, query, pagenum):
1313 | 		"""Report attempt to download playlist page with given number."""
1314 | 		query = query.decode(preferredencoding())
1315 | 		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1316 | 
1317 | 	def _real_extract(self, query):
1318 | 		mobj = re.match(self._VALID_URL, query)
1319 | 		if mobj is None:
1320 | 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1321 | 			return
1322 | 
1323 | 		prefix, query = query.split(':')
1324 | 		prefix = prefix[8:]
1325 | 		query = query.encode('utf-8')
1326 | 		if prefix == '':
1327 | 			self._download_n_results(query, 1)
1328 | 			return
1329 | 		elif prefix == 'all':
1330 | 			self._download_n_results(query, self._max_google_results)
1331 | 			return
1332 | 		else:
1333 | 			try:
1334 | 				n = long(prefix)
1335 | 				if n <= 0:
1336 | 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1337 | 					return
1338 | 				elif n > self._max_google_results:
1339 | 					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1340 | 					n = self._max_google_results
1341 | 				self._download_n_results(query, n)
1342 | 				return
1343 | 			except ValueError: # parsing prefix as integer fails
1344 | 				self._download_n_results(query, 1)
1345 | 				return
1346 | 
1347 | 	def _download_n_results(self, query, n):
1348 | 		"""Downloads a specified number of results for a query"""
1349 | 
1350 | 		video_ids = []
1351 | 		pagenum = 0
1352 | 
1353 | 		while True:
1354 | 			self.report_download_page(query, pagenum)
1355 | 			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1356 | 			request = urllib2.Request(result_url)
1357 | 			try:
1358 | 				page = urllib2.urlopen(request).read()
1359 | 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1360 | 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1361 | 				return
1362 | 
1363 | 			# Extract video identifiers
1364 | 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1365 | 				video_id = mobj.group(1)
1366 | 				if video_id not in video_ids:
1367 | 					video_ids.append(video_id)
1368 | 					if len(video_ids) == n:
1369 | 						# Specified n videos reached
1370 | 						for id in video_ids:
1371 | 							self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1372 | 						return
1373 | 
1374 | 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1375 | 				for id in video_ids:
1376 | 					self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1377 | 				return
1378 | 
1379 | 			pagenum = pagenum + 1
1380 | 
1381 | 
1382 | class YahooSearchIE(InfoExtractor):
1383 | 	"""Information Extractor for Yahoo! 
Video search queries.""" 1384 | _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+' 1385 | _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s' 1386 | _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"' 1387 | _MORE_PAGES_INDICATOR = r'\s*Next' 1388 | _max_yahoo_results = 1000 1389 | IE_NAME = u'video.yahoo:search' 1390 | 1391 | def __init__(self, downloader=None): 1392 | InfoExtractor.__init__(self, downloader) 1393 | 1394 | def report_download_page(self, query, pagenum): 1395 | """Report attempt to download playlist page with given number.""" 1396 | query = query.decode(preferredencoding()) 1397 | self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum)) 1398 | 1399 | def _real_extract(self, query): 1400 | mobj = re.match(self._VALID_URL, query) 1401 | if mobj is None: 1402 | self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 1403 | return 1404 | 1405 | prefix, query = query.split(':') 1406 | prefix = prefix[8:] 1407 | query = query.encode('utf-8') 1408 | if prefix == '': 1409 | self._download_n_results(query, 1) 1410 | return 1411 | elif prefix == 'all': 1412 | self._download_n_results(query, self._max_yahoo_results) 1413 | return 1414 | else: 1415 | try: 1416 | n = long(prefix) 1417 | if n <= 0: 1418 | self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 1419 | return 1420 | elif n > self._max_yahoo_results: 1421 | self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) 1422 | n = self._max_yahoo_results 1423 | self._download_n_results(query, n) 1424 | return 1425 | except ValueError: # parsing prefix as integer fails 1426 | self._download_n_results(query, 1) 1427 | return 1428 | 1429 | def _download_n_results(self, query, n): 1430 | """Downloads a specified number of results for a query""" 1431 | 1432 | video_ids = [] 1433 | already_seen = set() 1434 | pagenum = 1 1435 | 1436 | while True: 1437 | self.report_download_page(query, pagenum) 1438 | result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) 1439 | request = urllib2.Request(result_url) 1440 | try: 1441 | page = urllib2.urlopen(request).read() 1442 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1443 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 1444 | return 1445 | 1446 | # Extract video identifiers 1447 | for mobj in re.finditer(self._VIDEO_INDICATOR, page): 1448 | video_id = mobj.group(1) 1449 | if video_id not in already_seen: 1450 | video_ids.append(video_id) 1451 | already_seen.add(video_id) 1452 | if len(video_ids) == n: 1453 | # Specified n videos reached 1454 | for id in video_ids: 1455 | self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) 1456 | return 1457 | 1458 | if re.search(self._MORE_PAGES_INDICATOR, page) is None: 1459 | for id in video_ids: 1460 | self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) 1461 | return 1462 | 1463 | pagenum = pagenum + 1 1464 | 1465 | 1466 | class YoutubePlaylistIE(InfoExtractor): 1467 | """Information Extractor for YouTube playlists.""" 1468 | 1469 | _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' 1470 | _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' 1471 | _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&' 
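	# Illustrative example (ids are made up): for an ordinary playlist the page
	# URL built from _TEMPLATE_URL with (access, prefix, id, page) is
	#   http://www.youtube.com/view_play_list?p=PL1234567890A&page=1&gl=US&hl=en
	# and _VIDEO_INDICATOR_TEMPLATE % playlist_id then matches hrefs such as
	#   /watch?v=abc123XYZ45&list=PL1234567890A&index=1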
1472 | _MORE_PAGES_INDICATOR = r'yt-uix-pager-next' 1473 | IE_NAME = u'youtube:playlist' 1474 | 1475 | def __init__(self, downloader=None): 1476 | InfoExtractor.__init__(self, downloader) 1477 | 1478 | def report_download_page(self, playlist_id, pagenum): 1479 | """Report attempt to download playlist page with given number.""" 1480 | self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) 1481 | 1482 | def _real_extract(self, url): 1483 | # Extract playlist id 1484 | mobj = re.match(self._VALID_URL, url) 1485 | if mobj is None: 1486 | self._downloader.trouble(u'ERROR: invalid url: %s' % url) 1487 | return 1488 | 1489 | # Single video case 1490 | if mobj.group(3) is not None: 1491 | self._downloader.download([mobj.group(3)]) 1492 | return 1493 | 1494 | # Download playlist pages 1495 | # prefix is 'p' as default for playlists but there are other types that need extra care 1496 | playlist_prefix = mobj.group(1) 1497 | if playlist_prefix == 'a': 1498 | playlist_access = 'artist' 1499 | else: 1500 | playlist_prefix = 'p' 1501 | playlist_access = 'view_play_list' 1502 | playlist_id = mobj.group(2) 1503 | video_ids = [] 1504 | pagenum = 1 1505 | 1506 | while True: 1507 | self.report_download_page(playlist_id, pagenum) 1508 | url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum) 1509 | request = urllib2.Request(url) 1510 | try: 1511 | page = urllib2.urlopen(request).read() 1512 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1513 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 1514 | return 1515 | 1516 | # Extract video identifiers 1517 | ids_in_page = [] 1518 | for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): 1519 | if mobj.group(1) not in ids_in_page: 1520 | ids_in_page.append(mobj.group(1)) 1521 | video_ids.extend(ids_in_page) 1522 | 1523 | if re.search(self._MORE_PAGES_INDICATOR, page) is None: 1524 | break 1525 | pagenum = pagenum + 1 1526 | 1527 | playliststart = self._downloader.params.get('playliststart', 1) - 1 1528 | playlistend = self._downloader.params.get('playlistend', -1) 1529 | if playlistend == -1: 1530 | video_ids = video_ids[playliststart:] 1531 | else: 1532 | video_ids = video_ids[playliststart:playlistend] 1533 | 1534 | for id in video_ids: 1535 | self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) 1536 | return 1537 | 1538 | 1539 | class YoutubeUserIE(InfoExtractor): 1540 | """Information Extractor for YouTube users.""" 1541 | 1542 | _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' 1543 | _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' 1544 | _GDATA_PAGE_SIZE = 50 1545 | _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' 1546 | _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' 1547 | IE_NAME = u'youtube:user' 1548 | 1549 | def __init__(self, downloader=None): 1550 | InfoExtractor.__init__(self, downloader) 1551 | 1552 | def report_download_page(self, username, start_index): 1553 | """Report attempt to download user page.""" 1554 | self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % 1555 | (username, start_index, start_index + self._GDATA_PAGE_SIZE)) 1556 | 1557 | def _real_extract(self, url): 1558 | # Extract username 1559 | mobj = re.match(self._VALID_URL, url) 1560 | if mobj is None: 1561 | self._downloader.trouble(u'ERROR: invalid url: %s' % url) 1562 | return 1563 | 1564 | 
username = mobj.group(1) 1565 | 1566 | # Download video ids using YouTube Data API. Result size per 1567 | # query is limited (currently to 50 videos) so we need to query 1568 | # page by page until there are no video ids - it means we got 1569 | # all of them. 1570 | 1571 | video_ids = [] 1572 | pagenum = 0 1573 | 1574 | while True: 1575 | start_index = pagenum * self._GDATA_PAGE_SIZE + 1 1576 | self.report_download_page(username, start_index) 1577 | 1578 | request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)) 1579 | 1580 | try: 1581 | page = urllib2.urlopen(request).read() 1582 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1583 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 1584 | return 1585 | 1586 | # Extract video identifiers 1587 | ids_in_page = [] 1588 | 1589 | for mobj in re.finditer(self._VIDEO_INDICATOR, page): 1590 | if mobj.group(1) not in ids_in_page: 1591 | ids_in_page.append(mobj.group(1)) 1592 | 1593 | video_ids.extend(ids_in_page) 1594 | 1595 | # A little optimization - if current page is not 1596 | # "full", ie. does not contain PAGE_SIZE video ids then 1597 | # we can assume that this page is the last one - there 1598 | # are no more ids on further pages - no need to query 1599 | # again. 1600 | 1601 | if len(ids_in_page) < self._GDATA_PAGE_SIZE: 1602 | break 1603 | 1604 | pagenum += 1 1605 | 1606 | all_ids_count = len(video_ids) 1607 | playliststart = self._downloader.params.get('playliststart', 1) - 1 1608 | playlistend = self._downloader.params.get('playlistend', -1) 1609 | 1610 | if playlistend == -1: 1611 | video_ids = video_ids[playliststart:] 1612 | else: 1613 | video_ids = video_ids[playliststart:playlistend] 1614 | 1615 | self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" % 1616 | (username, all_ids_count, len(video_ids))) 1617 | 1618 | for video_id in video_ids: 1619 | self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id]) 1620 | 1621 | 1622 | class DepositFilesIE(InfoExtractor): 1623 | """Information extractor for depositfiles.com""" 1624 | 1625 | _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)' 1626 | IE_NAME = u'DepositFiles' 1627 | 1628 | def __init__(self, downloader=None): 1629 | InfoExtractor.__init__(self, downloader) 1630 | 1631 | def report_download_webpage(self, file_id): 1632 | """Report webpage download.""" 1633 | self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id) 1634 | 1635 | def report_extraction(self, file_id): 1636 | """Report information extraction.""" 1637 | self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id) 1638 | 1639 | def _real_extract(self, url): 1640 | file_id = url.split('/')[-1] 1641 | # Rebuild url in english locale 1642 | url = 'http://depositfiles.com/en/files/' + file_id 1643 | 1644 | # Retrieve file webpage with 'Free download' button pressed 1645 | free_download_indication = { 'gateway_result' : '1' } 1646 | request = urllib2.Request(url, urllib.urlencode(free_download_indication)) 1647 | try: 1648 | self.report_download_webpage(file_id) 1649 | webpage = urllib2.urlopen(request).read() 1650 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1651 | self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err)) 1652 | return 1653 | 1654 | # Search for the real file URL 1655 | mobj = re.search(r'
<form action="(http://fileshare.+?)"', webpage)
1656 | 		if (mobj is None) or (mobj.group(1) is None):
1657 | 			# Try to figure out reason of the error.
1658 | 			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1659 | 			if (mobj is not None) and (mobj.group(1) is not None):
1660 | 				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1661 | 				self._downloader.trouble(u'ERROR: %s' % restriction_message)
1662 | 			else:
1663 | 				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1664 | 			return
1665 | 
1666 | 		file_url = mobj.group(1)
1667 | 		file_extension = os.path.splitext(file_url)[1][1:]
1668 | 
1669 | 		# Search for file title
1670 | 		mobj = re.search(r'<b title="(.*?)">', webpage)
1671 | 		if mobj is None:
1672 | 			self._downloader.trouble(u'ERROR: unable to extract title')
1673 | 			return
1674 | 		file_title = mobj.group(1).decode('utf-8')
1675 | 
1676 | 		return [{
1677 | 			'id': file_id.decode('utf-8'),
1678 | 			'url': file_url.decode('utf-8'),
1679 | 			'uploader': u'NA',
1680 | 			'upload_date': u'NA',
1681 | 			'title': file_title,
1682 | 			'ext': file_extension.decode('utf-8'),
1683 | 			'format': u'NA',
1684 | 			'player_url': None,
1685 | 		}]
1686 | 
1687 | 
1688 | class FacebookIE(InfoExtractor):
1689 | 	"""Information Extractor for Facebook"""
1690 | 
1691 | 	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1692 | 	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1693 | 	_NETRC_MACHINE = 'facebook'
1694 | 	_available_formats = ['video', 'highqual', 'lowqual']
1695 | 	_video_extensions = {
1696 | 		'video': 'mp4',
1697 | 		'highqual': 'mp4',
1698 | 		'lowqual': 'mp4',
1699 | 	}
1700 | 	IE_NAME = u'facebook'
1701 | 
1702 | 	def __init__(self, downloader=None):
1703 | 		InfoExtractor.__init__(self, downloader)
1704 | 
1705 | 	def _reporter(self, message):
1706 | 		"""Add header and report message."""
1707 | 		self._downloader.to_screen(u'[facebook] %s' % message)
1708 | 
1709 | 	def report_login(self):
1710 | 		"""Report attempt to log in."""
1711 | 		self._reporter(u'Logging in')
1712 | 
1713 | 	def report_video_webpage_download(self, video_id):
1714 | 		"""Report attempt to download video webpage."""
1715 | 		self._reporter(u'%s: Downloading video webpage' % video_id)
1716 | 
1717 | 	def report_information_extraction(self, video_id):
1718 | 		"""Report attempt to extract video information."""
1719 | 		self._reporter(u'%s: Extracting video information' % video_id)
1720 | 
1721 | 	def _parse_page(self, video_webpage):
1722 | 		"""Extract video information from page"""
1723 | 		# General data
1724 | 		data = {'title': r'\("video_title", "(.*?)"\)',
1725 | 			'description': r'<div class="datawrap">(.*?)</div>',
1726 | 			'owner': r'\("video_owner_name", "(.*?)"\)',
1727 | 			'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1728 | 			}
1729 | 		video_info = {}
1730 | 		for piece in data.keys():
1731 | 			mobj = re.search(data[piece], video_webpage)
1732 | 			if mobj is not None:
1733 | 				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1734 | 
1735 | 		# Video urls
1736 | 		video_urls = {}
1737 | 		for fmt in self._available_formats:
1738 | 			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1739 | 			if mobj is not None:
1740 | 				# URL is in a Javascript segment inside an escaped Unicode format within
1741 | 				# the generally utf-8 page
1742 | 				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1743 | 		video_info['video_urls'] = video_urls
1744 | 
1745 | 		return video_info
1746 | 
1747 | 	def _real_initialize(self):
1748 | 		if self._downloader is None:
1749 | 			return
1750 | 
1751 | 		useremail = None
1752 | 		password = None
1753 | 		downloader_params = self._downloader.params
1754 | 
1755 | 		# Attempt to use provided username and password or .netrc data
1756 | 		if downloader_params.get('username', None) is not None:
1757 | 			useremail = downloader_params['username']
1758 | 			password = downloader_params['password']
1759 | 		elif downloader_params.get('usenetrc', False):
1760 | 			try:
1761 | 				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1762 | 				if info is not None:
1763 | 					useremail = info[0]
1764 | 					password = info[2]
1765 | 				else:
1766 | 					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1767 | 			except (IOError, netrc.NetrcParseError), err:
1768 | 				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1769 | 				return
1770 | 
1771 | 		if useremail is None:
1772 | 			return
1773 | 
1774 | 		# Log in
1775 | 		login_form = {
1776 | 			'email': useremail,
1777 | 			'pass': password,
1778 | 			'login': 'Log+In'
1779 | 		}
1780 | 		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1781 | 		try:
1782 | 			self.report_login()
1783 | 			login_results = urllib2.urlopen(request).read()
1784 | 			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1785 | 				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). 
Check credentials or wait.') 1786 | return 1787 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1788 | self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err)) 1789 | return 1790 | 1791 | def _real_extract(self, url): 1792 | mobj = re.match(self._VALID_URL, url) 1793 | if mobj is None: 1794 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 1795 | return 1796 | video_id = mobj.group('ID') 1797 | 1798 | # Get video webpage 1799 | self.report_video_webpage_download(video_id) 1800 | request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id) 1801 | try: 1802 | page = urllib2.urlopen(request) 1803 | video_webpage = page.read() 1804 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1805 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 1806 | return 1807 | 1808 | # Start extracting information 1809 | self.report_information_extraction(video_id) 1810 | 1811 | # Extract information 1812 | video_info = self._parse_page(video_webpage) 1813 | 1814 | # uploader 1815 | if 'owner' not in video_info: 1816 | self._downloader.trouble(u'ERROR: unable to extract uploader nickname') 1817 | return 1818 | video_uploader = video_info['owner'] 1819 | 1820 | # title 1821 | if 'title' not in video_info: 1822 | self._downloader.trouble(u'ERROR: unable to extract video title') 1823 | return 1824 | video_title = video_info['title'] 1825 | video_title = video_title.decode('utf-8') 1826 | 1827 | # thumbnail image 1828 | if 'thumbnail' not in video_info: 1829 | self._downloader.trouble(u'WARNING: unable to extract video thumbnail') 1830 | video_thumbnail = '' 1831 | else: 1832 | video_thumbnail = video_info['thumbnail'] 1833 | 1834 | # upload date 1835 | upload_date = u'NA' 1836 | if 'upload_date' in video_info: 1837 | upload_time = video_info['upload_date'] 1838 | timetuple = email.utils.parsedate_tz(upload_time) 1839 | if timetuple is not None: 1840 | try: 1841 | upload_date = time.strftime('%Y%m%d', timetuple[0:9]) 1842 | except: 1843 | pass 1844 | 1845 | # description 1846 | video_description = video_info.get('description', 'No description available.') 1847 | 1848 | url_map = video_info['video_urls'] 1849 | if len(url_map.keys()) > 0: 1850 | # Decide which formats to download 1851 | req_format = self._downloader.params.get('format', None) 1852 | format_limit = self._downloader.params.get('format_limit', None) 1853 | 1854 | if format_limit is not None and format_limit in self._available_formats: 1855 | format_list = self._available_formats[self._available_formats.index(format_limit):] 1856 | else: 1857 | format_list = self._available_formats 1858 | existing_formats = [x for x in format_list if x in url_map] 1859 | if len(existing_formats) == 0: 1860 | self._downloader.trouble(u'ERROR: no known formats available for video') 1861 | return 1862 | if req_format is None: 1863 | video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality 1864 | elif req_format == 'worst': 1865 | video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality 1866 | elif req_format == '-1': 1867 | video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats 1868 | else: 1869 | # Specific format 1870 | if req_format not in url_map: 1871 | self._downloader.trouble(u'ERROR: requested format not available') 1872 | return 1873 | video_url_list = [(req_format, url_map[req_format])] # Specific format 1874 | 1875 | 
results = [] 1876 | for format_param, video_real_url in video_url_list: 1877 | # Extension 1878 | video_extension = self._video_extensions.get(format_param, 'mp4') 1879 | 1880 | results.append({ 1881 | 'id': video_id.decode('utf-8'), 1882 | 'url': video_real_url.decode('utf-8'), 1883 | 'uploader': video_uploader.decode('utf-8'), 1884 | 'upload_date': upload_date, 1885 | 'title': video_title, 1886 | 'ext': video_extension.decode('utf-8'), 1887 | 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 1888 | 'thumbnail': video_thumbnail.decode('utf-8'), 1889 | 'description': video_description.decode('utf-8'), 1890 | 'player_url': None, 1891 | }) 1892 | return results 1893 | 1894 | class BlipTVIE(InfoExtractor): 1895 | """Information extractor for blip.tv""" 1896 | 1897 | _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$' 1898 | _URL_EXT = r'^.*\.([a-z0-9]+)$' 1899 | IE_NAME = u'blip.tv' 1900 | 1901 | def report_extraction(self, file_id): 1902 | """Report information extraction.""" 1903 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)) 1904 | 1905 | def report_direct_download(self, title): 1906 | """Report information extraction.""" 1907 | self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title)) 1908 | 1909 | def _real_extract(self, url): 1910 | mobj = re.match(self._VALID_URL, url) 1911 | if mobj is None: 1912 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 1913 | return 1914 | 1915 | if '?' in url: 1916 | cchar = '&' 1917 | else: 1918 | cchar = '?' 1919 | json_url = url + cchar + 'skin=json&version=2&no_wrap=1' 1920 | request = urllib2.Request(json_url) 1921 | self.report_extraction(mobj.group(1)) 1922 | info = None 1923 | try: 1924 | urlh = urllib2.urlopen(request) 1925 | if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download 1926 | basename = url.split('/')[-1] 1927 | title,ext = os.path.splitext(basename) 1928 | title = title.decode('UTF-8') 1929 | ext = ext.replace('.', '') 1930 | self.report_direct_download(title) 1931 | info = { 1932 | 'id': title, 1933 | 'url': url, 1934 | 'title': title, 1935 | 'ext': ext, 1936 | 'urlhandle': urlh 1937 | } 1938 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1939 | self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err)) 1940 | return 1941 | if info is None: # Regular URL 1942 | try: 1943 | json_code = urlh.read() 1944 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1945 | self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err)) 1946 | return 1947 | 1948 | try: 1949 | json_data = json.loads(json_code) 1950 | if 'Post' in json_data: 1951 | data = json_data['Post'] 1952 | else: 1953 | data = json_data 1954 | 1955 | upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') 1956 | video_url = data['media']['url'] 1957 | umobj = re.match(self._URL_EXT, video_url) 1958 | if umobj is None: 1959 | raise ValueError('Can not determine filename extension') 1960 | ext = umobj.group(1) 1961 | 1962 | info = { 1963 | 'id': data['item_id'], 1964 | 'url': video_url, 1965 | 'uploader': data['display_name'], 1966 | 'upload_date': upload_date, 1967 | 'title': data['title'], 1968 | 'ext': ext, 1969 | 'format': data['media']['mimeType'], 1970 | 'thumbnail': data['thumbnailUrl'], 1971 | 'description': data['description'], 1972 | 'player_url': data['embedUrl'] 1973 | } 1974 | except 
(ValueError,KeyError), err:
1975 | 			self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
1976 | 			return
1977 | 
1978 | 		return [info]
1979 | 
1980 | 
1981 | class MyVideoIE(InfoExtractor):
1982 | 	"""Information Extractor for myvideo.de."""
1983 | 
1984 | 	_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1985 | 	IE_NAME = u'myvideo'
1986 | 
1987 | 	def __init__(self, downloader=None):
1988 | 		InfoExtractor.__init__(self, downloader)
1989 | 
1990 | 	def report_download_webpage(self, video_id):
1991 | 		"""Report webpage download."""
1992 | 		self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1993 | 
1994 | 	def report_extraction(self, video_id):
1995 | 		"""Report information extraction."""
1996 | 		self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
1997 | 
1998 | 	def _real_extract(self, url):
1999 | 		mobj = re.match(self._VALID_URL, url)
2000 | 		if mobj is None:
2001 | 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2002 | 			return
2003 | 
2004 | 		video_id = mobj.group(1)
2005 | 
2006 | 		# Get video webpage
2007 | 		request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2008 | 		try:
2009 | 			self.report_download_webpage(video_id)
2010 | 			webpage = urllib2.urlopen(request).read()
2011 | 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2012 | 			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2013 | 			return
2014 | 
2015 | 		self.report_extraction(video_id)
2016 | 		mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2017 | 				webpage)
2018 | 		if mobj is None:
2019 | 			self._downloader.trouble(u'ERROR: unable to extract media URL')
2020 | 			return
2021 | 		video_url = mobj.group(1) + ('/%s.flv' % video_id)
2022 | 
2023 | 		mobj = re.search('<title>([^<]+)</title>', webpage)
2024 | 		if mobj is None:
2025 | 			self._downloader.trouble(u'ERROR: unable to extract title')
2026 | 			return
2027 | 
2028 | 		video_title = mobj.group(1)
2029 | 
2030 | 		return [{
2031 | 			'id': video_id,
2032 | 			'url': video_url,
2033 | 			'uploader': u'NA',
2034 | 			'upload_date': u'NA',
2035 | 			'title': video_title,
2036 | 			'ext': u'flv',
2037 | 			'format': u'NA',
2038 | 			'player_url': None,
2039 | 		}]
2040 | 
2041 | class ComedyCentralIE(InfoExtractor):
2042 | 	"""Information extractor for The Daily Show and Colbert Report"""
2043 | 
2044 | 	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2045 | 	IE_NAME = u'comedycentral'
2046 | 
2047 | 	def report_extraction(self, episode_id):
2048 | 		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2049 | 
2050 | 	def report_config_download(self, episode_id):
2051 | 		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2052 | 
2053 | 	def report_index_download(self, episode_id):
2054 | 		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2055 | 
2056 | 	def report_player_url(self, episode_id):
2057 | 		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2058 | 
2059 | 	def _real_extract(self, url):
2060 | 		mobj = re.match(self._VALID_URL, url)
2061 | 		if mobj is None:
2062 | 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2063 | 			return
2064 | 
2065 | 		if mobj.group('shortname'):
2066 | 			if mobj.group('shortname') in ('tds', 'thedailyshow'):
2067 | 				url = u'http://www.thedailyshow.com/full-episodes/'
2068 | 			else:
2069 | 				url = u'http://www.colbertnation.com/full-episodes/'
2070 | 			mobj = 
re.match(self._VALID_URL, url) 2071 | assert mobj is not None 2072 | 2073 | dlNewest = not mobj.group('episode') 2074 | if dlNewest: 2075 | epTitle = mobj.group('showname') 2076 | else: 2077 | epTitle = mobj.group('episode') 2078 | 2079 | req = urllib2.Request(url) 2080 | self.report_extraction(epTitle) 2081 | try: 2082 | htmlHandle = urllib2.urlopen(req) 2083 | html = htmlHandle.read() 2084 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2085 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err)) 2086 | return 2087 | if dlNewest: 2088 | url = htmlHandle.geturl() 2089 | mobj = re.match(self._VALID_URL, url) 2090 | if mobj is None: 2091 | self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url) 2092 | return 2093 | if mobj.group('episode') == '': 2094 | self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url) 2095 | return 2096 | epTitle = mobj.group('episode') 2097 | 2098 | mMovieParams = re.findall('(?:[^/]+)/(?P[^/?]+)[/?]?.*$' 2178 | IE_NAME = u'escapist' 2179 | 2180 | def report_extraction(self, showName): 2181 | self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName) 2182 | 2183 | def report_config_download(self, showName): 2184 | self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) 2185 | 2186 | def _real_extract(self, url): 2187 | mobj = re.match(self._VALID_URL, url) 2188 | if mobj is None: 2189 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2190 | return 2191 | showName = mobj.group('showname') 2192 | videoId = mobj.group('episode') 2193 | 2194 | self.report_extraction(showName) 2195 | try: 2196 | webPage = urllib2.urlopen(url) 2197 | webPageBytes = webPage.read() 2198 | m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type']) 2199 | webPage = webPageBytes.decode(m.group(1) if m else 'utf-8') 2200 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2201 | self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err)) 2202 | return 2203 | 2204 | descMatch = re.search('[0-9]+)/(?P.*)$' 2252 | IE_NAME = u'collegehumor' 2253 | 2254 | def report_webpage(self, video_id): 2255 | """Report information extraction.""" 2256 | self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) 2257 | 2258 | def report_extraction(self, video_id): 2259 | """Report information extraction.""" 2260 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) 2261 | 2262 | def _real_extract(self, url): 2263 | mobj = re.match(self._VALID_URL, url) 2264 | if mobj is None: 2265 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2266 | return 2267 | video_id = mobj.group('videoid') 2268 | 2269 | self.report_webpage(video_id) 2270 | request = urllib2.Request(url) 2271 | try: 2272 | webpage = urllib2.urlopen(request).read() 2273 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2274 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 2275 | return 2276 | 2277 | m = re.search(r'id="video:(?P[0-9]+)"', webpage) 2278 | if m is None: 2279 | self._downloader.trouble(u'ERROR: Cannot extract internal video ID') 2280 | return 2281 | internal_video_id = m.group('internalvideoid') 2282 | 2283 | info = { 2284 | 'id': video_id, 2285 | 'internal_id': internal_video_id, 2286 | } 2287 | 2288 | self.report_extraction(video_id) 2289 | xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + 
internal_video_id 2290 | try: 2291 | metaXml = urllib2.urlopen(xmlUrl).read() 2292 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2293 | self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err)) 2294 | return 2295 | 2296 | mdoc = xml.etree.ElementTree.fromstring(metaXml) 2297 | try: 2298 | videoNode = mdoc.findall('./video')[0] 2299 | info['description'] = videoNode.findall('./description')[0].text 2300 | info['title'] = videoNode.findall('./caption')[0].text 2301 | info['url'] = videoNode.findall('./file')[0].text 2302 | info['thumbnail'] = videoNode.findall('./thumbnail')[0].text 2303 | info['ext'] = info['url'].rpartition('.')[2] 2304 | info['format'] = info['ext'] 2305 | except IndexError: 2306 | self._downloader.trouble(u'\nERROR: Invalid metadata XML file') 2307 | return 2308 | 2309 | return [info] 2310 | 2311 | 2312 | class XVideosIE(InfoExtractor): 2313 | """Information extractor for xvideos.com""" 2314 | 2315 | _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' 2316 | IE_NAME = u'xvideos' 2317 | 2318 | def report_webpage(self, video_id): 2319 | """Report information extraction.""" 2320 | self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) 2321 | 2322 | def report_extraction(self, video_id): 2323 | """Report information extraction.""" 2324 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) 2325 | 2326 | def _real_extract(self, url): 2327 | mobj = re.match(self._VALID_URL, url) 2328 | if mobj is None: 2329 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2330 | return 2331 | video_id = mobj.group(1).decode('utf-8') 2332 | 2333 | self.report_webpage(video_id) 2334 | 2335 | request = urllib2.Request(r'http://www.xvideos.com/video' + video_id) 2336 | try: 2337 | webpage = urllib2.urlopen(request).read() 2338 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2339 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 2340 | return 2341 | 2342 | self.report_extraction(video_id) 2343 | 2344 | 2345 | # Extract video URL 2346 | mobj = re.search(r'flv_url=(.+?)&', webpage) 2347 | if mobj is None: 2348 | self._downloader.trouble(u'ERROR: unable to extract video url') 2349 | return 2350 | video_url = urllib2.unquote(mobj.group(1).decode('utf-8')) 2351 | 2352 | 2353 | # Extract title 2354 | mobj = re.search(r'(.*?)\s+-\s+XVID', webpage) 2355 | if mobj is None: 2356 | self._downloader.trouble(u'ERROR: unable to extract video title') 2357 | return 2358 | video_title = mobj.group(1).decode('utf-8') 2359 | 2360 | 2361 | # Extract video thumbnail 2362 | mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage) 2363 | if mobj is None: 2364 | self._downloader.trouble(u'ERROR: unable to extract video thumbnail') 2365 | return 2366 | video_thumbnail = mobj.group(1).decode('utf-8') 2367 | 2368 | info = { 2369 | 'id': video_id, 2370 | 'url': video_url, 2371 | 'uploader': None, 2372 | 'upload_date': None, 2373 | 'title': video_title, 2374 | 'ext': 'flv', 2375 | 'format': 'flv', 2376 | 'thumbnail': video_thumbnail, 2377 | 'description': None, 2378 | 'player_url': None, 2379 | } 2380 | 2381 | return [info] 2382 | 2383 | 2384 | class SoundcloudIE(InfoExtractor): 2385 | """Information extractor for soundcloud.com 2386 | To access the media, the uid of the song and a stream token 2387 | must be extracted from the page source and 
the script must make 2388 | a request to media.soundcloud.com/crossdomain.xml. Then 2389 | the media can be grabbed by requesting from an url composed 2390 | of the stream token and uid 2391 | """ 2392 | 2393 | _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' 2394 | IE_NAME = u'soundcloud' 2395 | 2396 | def __init__(self, downloader=None): 2397 | InfoExtractor.__init__(self, downloader) 2398 | 2399 | def report_webpage(self, video_id): 2400 | """Report information extraction.""" 2401 | self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) 2402 | 2403 | def report_extraction(self, video_id): 2404 | """Report information extraction.""" 2405 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) 2406 | 2407 | def _real_extract(self, url): 2408 | mobj = re.match(self._VALID_URL, url) 2409 | if mobj is None: 2410 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2411 | return 2412 | 2413 | # extract uploader (which is in the url) 2414 | uploader = mobj.group(1).decode('utf-8') 2415 | # extract simple title (uploader + slug of song title) 2416 | slug_title = mobj.group(2).decode('utf-8') 2417 | simple_title = uploader + u'-' + slug_title 2418 | 2419 | self.report_webpage('%s/%s' % (uploader, slug_title)) 2420 | 2421 | request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title)) 2422 | try: 2423 | webpage = urllib2.urlopen(request).read() 2424 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2425 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 2426 | return 2427 | 2428 | self.report_extraction('%s/%s' % (uploader, slug_title)) 2429 | 2430 | # extract uid and stream token that soundcloud hands out for access 2431 | mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage) 2432 | if mobj: 2433 | video_id = mobj.group(1) 2434 | stream_token = mobj.group(2) 2435 | 2436 | # extract unsimplified title 2437 | mobj = re.search('"title":"(.*?)",', webpage) 2438 | if mobj: 2439 | title = mobj.group(1).decode('utf-8') 2440 | else: 2441 | title = simple_title 2442 | 2443 | # construct media url (with uid/token) 2444 | mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" 2445 | mediaURL = mediaURL % (video_id, stream_token) 2446 | 2447 | # description 2448 | description = u'No description available' 2449 | mobj = re.search('track-description-value"><p>(.*?)</p>', webpage) 2450 | if mobj: 2451 | description = mobj.group(1) 2452 | 2453 | # upload date 2454 | upload_date = None 2455 | mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage) 2456 | if mobj: 2457 | try: 2458 | upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') 2459 | except Exception, e: 2460 | self._downloader.to_stderr(str(e)) 2461 | 2462 | # for soundcloud, a request to a cross domain is required for cookies 2463 | request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers) 2464 | 2465 | return [{ 2466 | 'id': video_id.decode('utf-8'), 2467 | 'url': mediaURL, 2468 | 'uploader': uploader.decode('utf-8'), 2469 | 'upload_date': upload_date, 2470 | 'title': title, 2471 | 'ext': u'mp3', 2472 | 'format': u'NA', 2473 | 'player_url': None, 2474 | 'description': description.decode('utf-8') 2475 | }] 2476 | 2477 | 2478 | class InfoQIE(InfoExtractor): 2479 | """Information extractor for infoq.com""" 2480 | 2481 | _VALID_URL = 
r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' 2482 | IE_NAME = u'infoq' 2483 | 2484 | def report_webpage(self, video_id): 2485 | """Report information extraction.""" 2486 | self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) 2487 | 2488 | def report_extraction(self, video_id): 2489 | """Report information extraction.""" 2490 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) 2491 | 2492 | def _real_extract(self, url): 2493 | mobj = re.match(self._VALID_URL, url) 2494 | if mobj is None: 2495 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2496 | return 2497 | 2498 | self.report_webpage(url) 2499 | 2500 | request = urllib2.Request(url) 2501 | try: 2502 | webpage = urllib2.urlopen(request).read() 2503 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2504 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 2505 | return 2506 | 2507 | self.report_extraction(url) 2508 | 2509 | 2510 | # Extract video URL 2511 | mobj = re.search(r"jsclassref='([^']*)'", webpage) 2512 | if mobj is None: 2513 | self._downloader.trouble(u'ERROR: unable to extract video url') 2514 | return 2515 | video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64')) 2516 | 2517 | 2518 | # Extract title 2519 | mobj = re.search(r'contentTitle = "(.*?)";', webpage) 2520 | if mobj is None: 2521 | self._downloader.trouble(u'ERROR: unable to extract video title') 2522 | return 2523 | video_title = mobj.group(1).decode('utf-8') 2524 | 2525 | # Extract description 2526 | video_description = u'No description available.' 2527 | mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage) 2528 | if mobj is not None: 2529 | video_description = mobj.group(1).decode('utf-8') 2530 | 2531 | video_filename = video_url.split('/')[-1] 2532 | video_id, extension = video_filename.split('.') 2533 | 2534 | info = { 2535 | 'id': video_id, 2536 | 'url': video_url, 2537 | 'uploader': None, 2538 | 'upload_date': None, 2539 | 'title': video_title, 2540 | 'ext': extension, 2541 | 'format': extension, # Extension is always(?) mp4, but seems to be flv 2542 | 'thumbnail': None, 2543 | 'description': video_description, 2544 | 'player_url': None, 2545 | } 2546 | 2547 | return [info] 2548 | 2549 | class MixcloudIE(InfoExtractor): 2550 | """Information extractor for www.mixcloud.com""" 2551 | _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' 2552 | IE_NAME = u'mixcloud' 2553 | 2554 | def __init__(self, downloader=None): 2555 | InfoExtractor.__init__(self, downloader) 2556 | 2557 | def report_download_json(self, file_id): 2558 | """Report JSON download.""" 2559 | self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME) 2560 | 2561 | def report_extraction(self, file_id): 2562 | """Report information extraction.""" 2563 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)) 2564 | 2565 | def get_urls(self, jsonData, fmt, bitrate='best'): 2566 | """Get urls from 'audio_formats' section in json""" 2567 | file_url = None 2568 | try: 2569 | bitrate_list = jsonData[fmt] 2570 | if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: 2571 | bitrate = max(bitrate_list) # select highest 2572 | 2573 | url_list = jsonData[fmt][bitrate] 2574 | except TypeError: # we have no bitrate info. 
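			# Assumed JSON shapes (values illustrative, not from the live API):
			# with bitrate info jsonData[fmt] is a dict like
			#   {'128': ['http://host/a.mp3', ...], '320': [...]}
			# so jsonData[fmt][bitrate] is a list of candidate urls; without it,
			# jsonData[fmt] is already the flat url list and indexing it with a
			# bitrate string raises the TypeError handled here.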
2549 | class MixcloudIE(InfoExtractor):
2550 |     """Information extractor for www.mixcloud.com"""
2551 |     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2552 |     IE_NAME = u'mixcloud'
2553 | 
2554 |     def __init__(self, downloader=None):
2555 |         InfoExtractor.__init__(self, downloader)
2556 | 
2557 |     def report_download_json(self, file_id):
2558 |         """Report JSON download."""
2559 |         self._downloader.to_screen(u'[%s] %s: Downloading json' % (self.IE_NAME, file_id))
2560 | 
2561 |     def report_extraction(self, file_id):
2562 |         """Report information extraction."""
2563 |         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2564 | 
2565 |     def get_urls(self, jsonData, fmt, bitrate='best'):
2566 |         """Get urls from 'audio_formats' section in json"""
2567 |         # jsonData[fmt] is either {bitrate: [url, ...]} or a flat [url, ...] list
2568 |         try:
2569 |             bitrate_list = jsonData[fmt]
2570 |             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2571 |                 bitrate = max(bitrate_list) # select highest
2572 | 
2573 |             url_list = jsonData[fmt][bitrate]
2574 |         except TypeError: # we have no bitrate info
2575 |             url_list = jsonData[fmt]
2576 |         return url_list
2577 | 
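    # A sketch of get_urls() on the two JSON shapes it tolerates (data made
    # up; the real structure comes from Mixcloud's cloudcast JSON):
    #
    #     >>> ie = MixcloudIE()
    #     >>> ie.get_urls({'mp3': {'128': ['http://a'], '320': ['http://b']}}, 'mp3')
    #     ['http://b']
    #     >>> ie.get_urls({'mp3': ['http://c']}, 'mp3')
    #     ['http://c']
    #
    # Note that max() compares the bitrate keys as strings, so e.g. '64' would
    # rank above '320'; "highest" is only lexicographic here.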
2578 |     def check_urls(self, url_list):
2579 |         """Returns the first active url from the list."""
2580 |         for url in url_list:
2581 |             try:
2582 |                 urllib2.urlopen(url)
2583 |                 return url
2584 |             except (urllib2.URLError, httplib.HTTPException, socket.error):
2585 |                 continue
2586 | 
2587 |         return None
2588 | 
2589 |     def _print_formats(self, formats):
2590 |         print 'Available formats:'
2591 |         for fmt in formats.keys():
2592 |             for b in formats[fmt]:
2593 |                 try:
2594 |                     ext = formats[fmt][b][0]
2595 |                     print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2596 |                 except TypeError: # we have no bitrate info
2597 |                     ext = formats[fmt][0]
2598 |                     print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2599 |                     break
2600 | 
2601 |     def _real_extract(self, url):
2602 |         mobj = re.match(self._VALID_URL, url)
2603 |         if mobj is None:
2604 |             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2605 |             return
2606 |         # extract uploader & filename from url
2607 |         uploader = mobj.group(1).decode('utf-8')
2608 |         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2609 | 
2610 |         # construct API request from the two path components matched above
2611 |         json_url = 'http://www.mixcloud.com/api/1/cloudcast/%s/%s.json' % (mobj.group(1), mobj.group(2))
2612 |         # retrieve .json file with links to files
2613 |         request = urllib2.Request(json_url)
2614 |         try:
2615 |             self.report_download_json(json_url)
2616 |             jsonData = urllib2.urlopen(request).read()
2617 |         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618 |             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2619 |             return
2620 | 
2621 |         # parse JSON
2622 |         json_data = json.loads(jsonData)
2623 |         player_url = json_data['player_swf_url']
2624 |         formats = dict(json_data['audio_formats'])
2625 | 
2626 |         req_format = self._downloader.params.get('format', None)
2627 | 
2628 |         if self._downloader.params.get('listformats', None):
2629 |             self._print_formats(formats)
2630 |             return
2631 | 
2632 |         file_url = None
2633 |         if req_format is None or req_format == 'best':
2634 |             for format_param in formats.keys():
2635 |                 url_list = self.get_urls(formats, format_param)
2636 |                 file_url = self.check_urls(url_list) # check urls
2637 |                 if file_url is not None:
2638 |                     break # got it!
2639 |         else:
2640 |             if req_format not in formats.keys():
2641 |                 self._downloader.trouble(u'ERROR: format is not available')
2642 |                 return
2643 |             url_list = self.get_urls(formats, req_format)
2644 |             file_url = self.check_urls(url_list)
2645 |             format_param = req_format
2646 | 
2647 |         if file_url is None:
2648 |             self._downloader.trouble(u'ERROR: unable to find an active download url')
2649 |             return
2650 | 
2651 |         return [{
2652 |             'id': file_id,
2653 |             'url': file_url,
2654 |             'uploader': uploader,
2655 |             'upload_date': u'NA',
2656 |             'title': json_data['name'],
2657 |             'ext': file_url.split('.')[-1],
2658 |             'format': (format_param is None and u'NA' or format_param),
2659 |             'thumbnail': json_data['thumbnail_url'],
2660 |             'description': json_data['description'],
2661 |         }]
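# The extractor below distinguishes three kinds of Open ClassRoom URLs via the
# named groups of its _VALID_URL (illustrative examples, not verified links):
#
#     .../MainFolder/VideoPage.php?course=ML&video=01   -> a single video
#     .../MainFolder/CoursePage.php?course=ML           -> course playlist
#     http://openclassroom.stanford.edu/                -> root playlist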
2662 | class StanfordOpenClassroomIE(InfoExtractor):
2663 |     """Information extractor for Stanford's Open ClassRoom"""
2664 | 
2665 |     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2666 |     IE_NAME = u'stanfordoc'
2667 | 
2668 |     def report_download_webpage(self, objid):
2669 |         """Report information extraction."""
2670 |         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2671 | 
2672 |     def report_extraction(self, video_id):
2673 |         """Report information extraction."""
2674 |         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2675 | 
2676 |     def _real_extract(self, url):
2677 |         mobj = re.match(self._VALID_URL, url)
2678 |         if mobj is None:
2679 |             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2680 |             return
2681 | 
2682 |         if mobj.group('course') and mobj.group('video'): # A specific video
2683 |             course = mobj.group('course')
2684 |             video = mobj.group('video')
2685 |             info = {
2686 |                 'id': course + '_' + video,
2687 |             }
2688 | 
2689 |             self.report_extraction(info['id'])
2690 |             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2691 |             xmlUrl = baseUrl + video + '.xml'
2692 |             try:
2693 |                 metaXml = urllib2.urlopen(xmlUrl).read()
2694 |             except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2695 |                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2696 |                 return
2697 |             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2698 |             try:
2699 |                 info['title'] = mdoc.findall('./title')[0].text
2700 |                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2701 |             except IndexError:
2702 |                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2703 |                 return
2704 |             info['ext'] = info['url'].rpartition('.')[2]
2705 |             info['format'] = info['ext']
2706 |             return [info]
2707 |         elif mobj.group('course'): # A course page
2708 |             course = mobj.group('course')
2709 |             info = {
2710 |                 'id': course,
2711 |                 'type': 'playlist',
2712 |             }
2713 | 
2714 |             self.report_download_webpage(info['id'])
2715 |             try:
2716 |                 coursepage = urllib2.urlopen(url).read()
2717 |             except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2718 |                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2719 |                 return
2720 | 
2721 |             m = re.search('<h1>([^<]+)</h1>', coursepage)
2722 |             if m:
2723 |                 info['title'] = unescapeHTML(m.group(1))
2724 |             else:
2725 |                 info['title'] = info['id']
2726 | 
2727 |             m = re.search('<description>([^<]+)</description>', coursepage)
2728 |             if m:
2729 |                 info['description'] = unescapeHTML(m.group(1))
2730 | 
2731 |             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2732 |             info['list'] = [
2733 |                 {
2734 |                     'type': 'reference',
2735 |                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2736 |                 }
2737 |                 for vpage in links]
2738 |             results = []
2739 |             for entry in info['list']:
2740 |                 assert entry['type'] == 'reference'
2741 |                 results += self.extract(entry['url'])
2742 |             return results
2743 | 
2744 |         else: # Root page
2745 |             info = {
2746 |                 'id': 'Stanford OpenClassroom',
2747 |                 'type': 'playlist',
2748 |             }
2749 | 
2750 |             self.report_download_webpage(info['id'])
2751 |             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2752 |             try:
2753 |                 rootpage = urllib2.urlopen(rootURL).read()
2754 |             except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2755 |                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2756 |                 return
2757 | 
2758 |             info['title'] = info['id']
2759 | 
2760 |             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2761 |             info['list'] = [
2762 |                 {
2763 |                     'type': 'reference',
2764 |                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2765 |                 }
2766 |                 for cpage in links]
2767 | 
2768 |             results = []
2769 |             for entry in info['list']:
2770 |                 assert entry['type'] == 'reference'
2771 |                 results += self.extract(entry['url'])
2772 |             return results
2773 | 
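# Both playlist branches above recurse through self.extract(): the root page
# fans out into CoursePage references and each course page fans out into
# VideoPage references, so a single root URL ultimately yields one info dict
# per video. Deduplication happens only per page, via orderedSet() on the
# scraped links.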
2774 | class MTVIE(InfoExtractor):
2775 |     """Information extractor for MTV.com"""
2776 | 
2777 |     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2778 |     IE_NAME = u'mtv'
2779 | 
2780 |     def report_webpage(self, video_id):
2781 |         """Report information extraction."""
2782 |         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2783 | 
2784 |     def report_extraction(self, video_id):
2785 |         """Report information extraction."""
2786 |         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2787 | 
2788 |     def _real_extract(self, url):
2789 |         mobj = re.match(self._VALID_URL, url)
2790 |         if mobj is None:
2791 |             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2792 |             return
2793 |         if not mobj.group('proto'):
2794 |             url = 'http://' + url
2795 |         video_id = mobj.group('videoid')
2796 |         self.report_webpage(video_id)
2797 | 
2798 |         request = urllib2.Request(url)
2799 |         try:
2800 |             webpage = urllib2.urlopen(request).read()
2801 |         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2802 |             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2803 |             return
2804 | 
2805 |         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2806 |         if mobj is None:
2807 |             self._downloader.trouble(u'ERROR: unable to extract song name')
2808 |             return
2809 |         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2810 |         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2811 |         if mobj is None:
2812 |             self._downloader.trouble(u'ERROR: unable to extract performer')
2813 |             return
2814 |         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2815 |         video_title = performer + ' - ' + song_name
2816 | 
2817 |         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2818 |         if mobj is None:
2819 |             self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
2820 |             return
2821 |         mtvn_uri = mobj.group(1)
2822 | 
2823 |         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2824 |         if mobj is None:
2825 |             self._downloader.trouble(u'ERROR: unable to extract content id')
2826 |             return
2827 |         content_id = mobj.group(1)
2828 | 
2829 |         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2830 |         self.report_extraction(video_id)
2831 |         request = urllib2.Request(videogen_url)
2832 |         try:
2833 |             metadataXml = urllib2.urlopen(request).read()
2834 |         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2835 |             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2836 |             return
2837 | 
2838 |         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2839 |         renditions = mdoc.findall('.//rendition')
2840 | 
2841 |         # For now, always pick the highest quality.
2842 |         rendition = renditions[-1]
2843 | 
2844 |         try:
2845 |             _, _, ext = rendition.attrib['type'].partition('/')
2846 |             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2847 |             video_url = rendition.find('./src').text
2848 |         except KeyError:
2849 |             self._downloader.trouble(u'ERROR: invalid rendition field')
2850 |             return
2851 | 
2852 |         info = {
2853 |             'id': video_id,
2854 |             'url': video_url,
2855 |             'uploader': performer,
2856 |             'title': video_title,
2857 |             'ext': ext,
2858 |             'format': format,
2859 |         }
2860 | 
2861 |         return [info]
2862 | 
--------------------------------------------------------------------------------
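The rendition selection in MTVIE can be exercised in isolation. Below is a minimal sketch feeding the same xml.etree logic a hand-written mediaGen-style document; the element names, attributes, and URLs are guesses inferred from the parsing code above, not taken from any MTV documentation, and picking renditions[-1] as the "highest quality" assumes the server lists renditions in ascending quality.

    # demo_mediagen.py -- standalone illustration of MTVIE's rendition parsing
    import xml.etree.ElementTree

    # Hypothetical mediaGen response; shape inferred from MTVIE._real_extract()
    SAMPLE = """<package><video><item>
      <rendition type="video/mp4" width="640" height="480" bitrate="800">
        <src>rtmpe://example.invalid/video_800.mp4</src>
      </rendition>
      <rendition type="video/mp4" width="1280" height="720" bitrate="2000">
        <src>rtmpe://example.invalid/video_2000.mp4</src>
      </rendition>
    </item></video></package>"""

    mdoc = xml.etree.ElementTree.fromstring(SAMPLE)
    renditions = mdoc.findall('.//rendition')
    rendition = renditions[-1]  # last entry, assumed to be the highest quality

    _, _, ext = rendition.attrib['type'].partition('/')
    format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']

    print format                        # mp4-1280x720_2000
    print rendition.find('./src').text  # rtmpe://example.invalid/video_2000.mp4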