├── .gitignore
├── chrome
│   ├── source.crx
│   ├── source
│   │   ├── ikona128.png
│   │   ├── manifest.json
│   │   ├── background.html
│   │   └── js
│   │       └── inject.js
│   └── source.pem
├── youtube_dl
│   ├── __main__.py
│   ├── PostProcessor.py
│   ├── utils.py
│   ├── __init__.py
│   ├── FileDownloader.py
│   └── InfoExtractors.py
├── install.sh
├── readme.md
├── index.html
└── youtube-dl-server

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 

--------------------------------------------------------------------------------
/chrome/source.crx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dz0ny/youtube-dl-server/HEAD/chrome/source.crx

--------------------------------------------------------------------------------
/chrome/source/ikona128.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dz0ny/youtube-dl-server/HEAD/chrome/source/ikona128.png

--------------------------------------------------------------------------------
/youtube_dl/__main__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import __init__
5 | 
6 | if __name__ == '__main__':
7 |     __init__.main()
8 | 

--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | GITHUB=https://github.com/dz0ny/youtube-dl-server
3 | 
4 | rm -rf ~/youtube-dl-server
5 | mkdir ~/youtube-dl-server
6 | cd ~/youtube-dl-server
7 | curl -L $GITHUB/tarball/master -o ./youtube-dl-server.tar.gz
8 | tar -zxvf ./youtube-dl-server.tar.gz --strip 1
9 | rm ./youtube-dl-server.tar.gz
10 | rm ./install.sh
11 | chmod a+x ./youtube-dl-server

--------------------------------------------------------------------------------
/chrome/source/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "YoutubeDL",
3 |   "version": "0.0.4",
4 |   "description": "Downloads YouTube videos as MP3",
5 |   "icons": {"128": "ikona128.png" },
6 |   "page_action": {
7 |     "default_icon": "ikona128.png", // optional
8 |     "default_title": "Download MP3"
9 |   },
10 |   "background_page": "background.html",
11 |   "permissions": [
12 |     "",
13 |     "unlimitedStorage",
14 |     "tabs"
15 |   ]
16 | }
17 | 

--------------------------------------------------------------------------------
/chrome/source/background.html:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
28 | 
29 | 

--------------------------------------------------------------------------------
/chrome/source.pem:
--------------------------------------------------------------------------------
1 | -----BEGIN PRIVATE KEY-----
2 | MIICeAIBADANBgkqhkiG9w0BAQEFAASCAmIwggJeAgEAAoGBAOqQoSNZfRe4Ra9cb
3 | ZrlJjcVVdl3DRMsQO54JRnlBU61AAoFT+Q4ozKsuXmLxK8IuQbLpmlMCeAqZKEKSp
4 | 91eJGaiYBBTvZyZUaxCjtf2d/T70mc/9aENrT257t9B7FoIksX9HycIHB5oBw/t+3
5 | FqLFl1a7uLGChg66IPJgPGujRAgMBAAECgYEAn87NmfHcIg7vmxvTqNY6BQlKJhDQ
6 | HaHm0xGT5WJ9DTSPxEP+PDTCK0I2UzMAW2gL9y9EPzUI/WqkiHskgCNecjbo5YIMm
7 | DAOlN4KPiZxqb1K4zJWrlkWUkju94ZrHXUGELesChpqONvj8ImVT/2KyIGFFfdLpW
8 | fYYNPqojEaB4kCQQD57dDZVfi3twlTrCWWQ/W+Iip7RxbqvcCx2XeeFZFckCn0ck/
9 | BK+GFrA+kJNFpslSa0nKFvsdd5cNScXsXVT3DAkEA8ENFwW0DNOoADwqyPSC4kLk5
10 | PxbamBTFWeguQlloqilkRJA8KT0IprUPCpwx72ZwuIqs3L1hvna1mbJa+KRx2wJBA
11 | Ju5A8IHARtm3lbWEe1YlstK+nEpUCwe4uttdkx3X8TuxlVazDquHqxtEqnRjvFufa
12 | yhp12SCyKEQHkj3/Af2oUCQQDJeOAJrww9VuvtsR59u+6JDYk/qj5rwR8soVIJOUh
13 | XSJZYGEsamZ+ji7itQQVupwxm84K5J+XK/WiTFcjLL7p5AkBvYTCV/KMxruoNSC35
14 | heNWqYKQ8au+9amVG4UNVqTJEeCePbzNsFn4Q2QQA2Yxy2f/YIjP8sUmlfO9S8Uoh
15 | ovr
16 | -----END PRIVATE KEY-----
17 | 

--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # youtube-dl-server
2 | 
3 | Allows you to download mp3 audio or mp4 video from popular video sites. You can use bookmarklets or the bundled Chrome extension to initiate downloads.
4 | 
5 | ## Usage
6 | 
7 |     $ youtube-dl-server
8 | 
9 | ## Installation
10 | 
11 |     $ sudo apt-get install ffmpeg
12 |     $ curl -L https://github.com/dz0ny/youtube-dl-server/raw/master/install.sh | bash
13 | 
14 | ## License
15 | 
16 | (The MIT License)
17 | 
18 | Copyright (c) 2011 Janez Troha
19 | 
20 | Permission is hereby granted, free of charge, to any person obtaining
21 | a copy of this software and associated documentation files (the
22 | 'Software'), to deal in the Software without restriction, including
23 | without limitation the rights to use, copy, modify, merge, publish,
24 | distribute, sublicense, and/or sell copies of the Software, and to
25 | permit persons to whom the Software is furnished to do so, subject to
26 | the following conditions:
27 | 
28 | The above copyright notice and this permission notice shall be
29 | included in all copies or substantial portions of the Software.
30 | 
31 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
32 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
34 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
35 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
36 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
37 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
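Once the server is running (see Usage above), any HTTP client can drive the endpoints listed in index.html. A minimal client sketch in Python 2 (to match the codebase); the video URL is a placeholder, and it relies on the server's naive query parsing, which expects the raw, unencoded URL after the `=`:

    import urllib2

    video = 'http://www.youtube.com/watch?v=EXAMPLE'  # placeholder URL
    # The server matches the literal '?music320mp3=' prefix and passes the
    # rest of the query string straight to youtube-dl, so the URL is sent
    # unencoded.
    print urllib2.urlopen('http://localhost:9099/?music320mp3=' + video).read()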
--------------------------------------------------------------------------------
/chrome/source/js/inject.js:
--------------------------------------------------------------------------------
1 | 
2 | (function(window, undefined) {
3 |     $("textarea").attr("placeholder", "Enter translation...");
4 |     $("#q").replaceWith('');
5 |     $('tr:first').html($('.search').remove().clone());
6 |     $(document).bind('keydown', 'ctrl+x', function() {
7 |         $("form.edit_tolk_locale").submit();
8 |     });
9 |     $(document).bind('keydown', 'ctrl+right', function() {
10 |         chrome.extension.sendRequest({'action' : 'navigate', "url" : "https://teambox.com" + $(".next_page").attr("href")});
11 |     });
12 |     $(document).bind('keydown', 'ctrl+left', function() {
13 |         chrome.extension.sendRequest({'action' : 'navigate', "url" : "https://teambox.com" + $(".previous_page").attr("href")});
14 |     });
15 |     $('#head h1').append('Download translation');
16 |     $('.translations tr').append('');
17 |     $('nav:first').remove();
18 |     $(".button.google_translate").live("click", function(event) {
19 |         event.preventDefault();
20 |         var translation = $(this).parent().parent();
21 |         var phrase = translation.find(".phrase").contents()
22 |             .filter(function(){ return(this.nodeType == 3); })
23 |             .text()
24 |             .replace(/^\W/g, "")
25 |             .replace(/\s{2,}/g, "");
26 |         var aT = new Translate(phrase);
27 |         aT.search(function(result) {
28 |             translation.find("textarea").val(result);
29 |         });
30 |         return false;
31 |     });
32 |     $(".button.copy").live("click", function(event) {
33 |         event.preventDefault();
34 |         var translation = $(this).parent().parent();
35 |         var phrase = translation.find(".phrase").contents()
36 |             .filter(function(){ return(this.nodeType == 3); })
37 |             .text()
38 |             .replace(/^\W/g, "")
39 |             .replace(/\s{2,}/g, "");
40 |         translation.find("textarea").val(phrase);
41 |         return false;
42 |     });
43 |     $(".button.clean").live("click", function(event) {
44 |         event.preventDefault();
45 |         var translation = $(this).parent().parent();
46 |         translation.find("textarea").val("");
47 |         return false;
48 |     });
49 |     $(".button.download").live("click", function(event) {
50 |         event.preventDefault();
51 |         var url = "https://teambox.com" + $("form").attr("action") + ".yml";
52 |         chrome.extension.sendRequest({'action' : 'download', "url" : url});
53 |         return false;
54 |     });
55 |     var Translate = function(wordToTranslate){
56 |         this.rubyC = [];
57 |         this.init(this, wordToTranslate);
58 |     };
59 |     Translate.prototype = {
60 |         init: function(self, wordToTranslate){
61 |             self.wordToTranslate = self.parse(wordToTranslate);
62 |             self.locale = $("form").attr("action").replace("/tolk/locales/", "");
63 |         },
64 |         _req: function(url, callback){
65 |             chrome.extension.sendRequest({'action' : '_req', "url" : url}, callback);
66 |         },
67 |         parse: function(toParse){
68 |             var self = this;
69 |             var ret = "";
70 |             if (toParse.indexOf("---") == -1) {
71 |                 if (toParse.indexOf("{") !== -1) {
72 |                     ret = toParse.replace(/(%{.*?})/ig, function(text) {
73 |                         self.rubyC.push(text);
74 |                         return ""+(self.rubyC.length-1)+"";
75 |                     })
76 | 
77 |                 }else{
78 |                     ret = toParse.replace(/\s<(.+)>(\d+)<\/.+>\s<\/esc>/ig, "<$1>$2").replace(/(\d+)<\/esc>|\s(\d+)\s<\/esc>/ig, function(text, index, indexa) {
79 |                         return self.rubyC[(index?index:indexa)];
80 |                     })
81 |                 }
82 | 
83 |             }else{
84 | 
85 |                 alert("These kinds of phrases aren't supported!")
86 |             }
87 |             return ret;
88 | 
89 |         },
90 |         search: function(callback){
91 |             var self = this;
92 | 
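// Editorial note on the call below: parse() has already swapped any
// Ruby-style %{...} interpolation markers in the phrase for numbered
// placeholders (the <esc> wrappers referenced by the regexes above), so the
// Google Translate v2 request (format=html, source=en, target=<tolk locale>)
// will not translate them; the translated text is then passed through
// parse() again to put the original markers back.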
this._req("https://www.googleapis.com/language/translate/v2?key=AIzaSyCqbDDq_gkCnhpiSfKnOedtJmaBZMZPdp8&format=html&q="+encodeURIComponent(this.wordToTranslate)+"&source=en&target="+this.locale, function(json) { 93 | var word = self.parse( JSON.parse(json).data.translations[0].translatedText ); 94 | callback(word); 95 | }); 96 | } 97 | } 98 | 99 | })(window); -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | youtubedl-server 8 | 9 | 10 | 11 |
youtubedl-server


API

  • http://localhost:9099/?music320aac=url Download and convert to aac
  • http://localhost:9099/?music320mp3=url Download and convert to mp3
  • http://localhost:9099/?musicbest=url Download and extract audio in the best available format
  • http://localhost:9099/?video=url Download video
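Each query parameter above maps to a handler thread in youtube-dl-server that rebuilds a youtube-dl command line. A sketch of the mapping for music320mp3, mirroring the Music320mp3 handler in the script ('youtube-dl-server' stands in for sys.argv[0], and the video URL is a placeholder):

    from urlparse import urlparse  # Python 2, as used by the server

    path = '/?music320mp3=http://www.youtube.com/watch?v=EXAMPLE'
    url = urlparse(path).query.replace('music320mp3=', '')
    # The server re-invokes itself as youtube-dl with fixed audio options:
    print '%s -t "%s" --extract-audio --audio-format mp3 --audio-quality 320k' \
        % ('youtube-dl-server', url)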
37 |

License


(The MIT License) Copyright (c) 2011 Janez Troha 45 | Permission is hereby granted, free of charge, to any person obtaining 46 | a copy of this software and associated documentation files (the 'Software'), 47 | to deal in the Software without restriction, including without limitation 48 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 49 | and/or sell copies of the Software, and to permit persons to whom the Software 50 | is furnished to do so, subject to the following conditions: The above copyright 51 | notice and this permission notice shall be included in all copies or substantial 52 | portions of the Software. THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY 53 | OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 54 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 55 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 56 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 57 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 58 | DEALINGS IN THE SOFTWARE.

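This page is served by the youtube-dl-server script that follows. One detail worth calling out: instead of using HTTPServer.serve_forever(), the script loops on a shouldRun flag so that a request to /die can stop the server cleanly. A condensed, illustrative sketch of that pattern (Python 2, stripped of the download handlers; not the full script):

    from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer

    class StoppableHttpServer(HTTPServer):
        def serve_forever(self):
            self.shouldRun = True
            while self.shouldRun:        # re-checked after every handled request
                self.handle_request()    # blocks until a single request arrives

    class Handler(BaseHTTPRequestHandler):
        def do_GET(self):
            if self.path == '/die':      # flipping the flag ends the serve loop
                self.server.shouldRun = False
            self.send_response(200)
            self.end_headers()

    StoppableHttpServer(('localhost', 9099), Handler).serve_forever()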
--------------------------------------------------------------------------------
/youtube-dl-server:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | # Author: Janez Troha
5 | # License: MIT
6 | # NOTE: This script heavily relies on Ubuntu
7 | # TODO: * Queue for downloading, converting
8 | #       * Support for other OSes
9 | 
10 | import sys
11 | import os
12 | import threading
13 | import webbrowser
14 | import re
15 | import youtube_dl
16 | import subprocess
17 | import shlex
18 | 
19 | from urlparse import urlparse
20 | from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
21 | 
22 | try:
23 |     subprocess.call(['ffmpeg'], stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
24 | except (OSError, IOError):
25 |     sys.exit(u'ERROR: "ffmpeg" could not be found, please install it!')
26 | 
27 | try:
28 |     import pynotify
29 |     if not pynotify.init('Youtube-DL'):
30 |         print 'there was a problem initializing the pynotify module'
31 | except:
32 |     print "you don't seem to have pynotify installed"
33 | 
34 | ## Download location detection
35 | 
36 | userhome = os.path.expanduser('~')
37 | try:
38 | 
39 |     # This is an Ubuntu-flavored thing
40 | 
41 |     userdirs = open(os.path.join(userhome, '.config/user-dirs.dirs'), 'r')
42 |     paths = userdirs.read()
43 |     regexVideo = re.compile('XDG_VIDEOS_DIR="(.+)"', re.MULTILINE | re.UNICODE)
44 |     regexMusic = re.compile('XDG_MUSIC_DIR="(.+)"', re.MULTILINE | re.UNICODE)
45 |     VideoPath = regexVideo.findall(paths)[0].replace('$HOME', userhome)
46 |     MusicPath = regexMusic.findall(paths)[0].replace('$HOME', userhome)
47 | except IOError:
48 |     VideoPath = os.path.join(userhome, 'Video')
49 |     MusicPath = os.path.join(userhome, 'Music')
50 | 
51 | 
52 | def Notify(text):
53 |     try:
54 |         n = pynotify.Notification('Youtube-dl', text, 'dialog-info')
55 |         n.set_timeout(1)
56 |         n.show()
57 |     except:
58 |         print "you don't seem to have pynotify installed"
59 | 
60 | 
61 | class ServerThread(threading.Thread):
62 | 
63 |     def __init__(self, port):
64 |         self.port = port
65 |         threading.Thread.__init__(self)
66 | 
67 |     def run(self):
68 |         self.server = StoppableHttpServer(('localhost', self.port),
69 |                 StoppableHttpServerRequestHandler)
70 |         Notify("Server started on 'http://localhost:" + str(self.port) + "'")
71 |         webbrowser.open('http://localhost:' + str(self.port))
72 |         self.server.serve_forever()
73 | 
74 |     def serverIsAlive(self):
75 |         return self.server.shouldRun
76 | 
77 | 
78 | class StoppableHttpServer(HTTPServer):
79 | 
80 |     def serve_forever(self):
81 |         self.shouldRun = True
82 |         while self.shouldRun:
83 |             self.handle_request()
84 | 
85 | 
86 | class StoppableHttpServerRequestHandler(BaseHTTPRequestHandler):
87 | 
88 |     # don't log stuff
89 | 
90 |     def log_message(self, format, *args):
91 |         pass
92 | 
93 |     def do_GET(self):
94 |         try:
95 |             if self.path.endswith('.html') or self.path.endswith('.crx') or self.path == '/':
96 | 
97 |                 # rewrite url
98 | 
99 |                 if self.path == '/':
100 |                     self.path = '/index.html'
101 | 
102 |                 f = open(sys.path[0] + self.path)
103 | 
104 |                 if self.path.endswith('.crx'):
105 |                     self._sendHeader(200, 'application/x-chrome-extension')
106 |                 else:
107 |                     self._sendHeader(200, 'text/html')
108 | 
109 |                 self.wfile.write(f.read())
110 |                 f.close()
111 |                 return
112 |             elif self.path == '/die':
113 |                 Notify('Server kill request received')
114 |                 self._sendHeader(200, 'text/plain')
115 |                 self.wfile.write('Server kill request received')
116 |                 self.server.shouldRun = False
117 |                 return
118 |             elif self._tryResponse('music320mp3', Music320mp3):
119 |                 pass
120 |             elif self._tryResponse('music320aac', Music320aac):
121 |                 pass
122 |             elif self._tryResponse('musicbest', MusicBest):
123 |                 pass
124 |             elif self._tryResponse('video', Video):
125 |                 pass
126 |             else:
127 |                 raise IOError
128 |             return
129 |         except IOError:
130 |             self.send_error(404, 'File Not Found: %s' % self.path)
131 | 
132 |     def _sendHeader(self, response_code, content_type):
133 |         self.send_response(response_code)
134 |         self.send_header('Content-type', content_type)
135 |         self.end_headers()
136 | 
137 |     def _tryResponse(self, handler, postprocessor):
138 |         if '?' + handler + '=' in self.path:
139 |             self._sendHeader(200, 'text/plain')
140 |             self.wfile.write('Downloading of "' + self.path + '" started')
141 |             run = postprocessor(self.path)
142 |             run.start()
143 |             return True
144 |         else:
145 |             return False
146 | 
147 | 
148 | ## Music handlers
149 | 
150 | class Music320mp3(threading.Thread):
151 | 
152 |     def __init__(self, url):
153 |         self.url = urlparse(url).query.replace('music320mp3=', '')
154 |         self.command = '%s -t "%s" --extract-audio --audio-format mp3 --audio-quality 320k' \
155 |             % (sys.argv[0], self.url)
156 |         Notify('Downloading of "' + self.url + '" started')
157 |         threading.Thread.__init__(self)
158 | 
159 |     def run(self):
160 |         subprocess.call(shlex.split(self.command))
161 | 
162 | 
163 | class Music320aac(threading.Thread):
164 | 
165 |     def __init__(self, url):
166 |         self.url = urlparse(url).query.replace('music320aac=', '')
167 |         self.command = '%s -t "%s" --extract-audio --audio-format aac --audio-quality 320k' \
168 |             % (sys.argv[0], self.url)
169 |         Notify('Downloading of "' + self.url + '" started')
170 |         threading.Thread.__init__(self)
171 | 
172 |     def run(self):
173 |         subprocess.call(shlex.split(self.command))
174 | 
175 | 
176 | class MusicBest(threading.Thread):
177 | 
178 |     def __init__(self, url):
179 |         self.url = urlparse(url).query.replace('musicbest=', '')
180 |         self.command = '%s -t "%s" --extract-audio' % (sys.argv[0], self.url)
181 |         Notify('Downloading of "' + self.url + '" started')
182 |         threading.Thread.__init__(self)
183 | 
184 |     def run(self):
185 |         subprocess.call(shlex.split(self.command))
186 | 
187 | class Video(threading.Thread):
188 | 
189 |     def __init__(self, url):
190 |         self.url = urlparse(url).query.replace('video=', '')
191 |         self.command = '%s -t "%s"' % (sys.argv[0], self.url)
192 |         Notify('Downloading of "' + self.url + '" started')
193 |         threading.Thread.__init__(self)
194 | 
195 |     def run(self):
196 |         subprocess.call(shlex.split(self.command))
197 | 
198 | ## Main routine ##
199 | 
200 | def main():
201 |     try:
202 |         server = ServerThread(9099)
203 |         server.start()
204 |     except:
205 |         sys.exit(0)
206 | 
207 | 
208 | if __name__ == '__main__':
209 |     try:
210 |         if len(sys.argv) > 1:
211 |             youtube_dl.main()
212 |         else:
213 |             main()
214 |     except:
215 |         print 'Unknown error'
216 | 

--------------------------------------------------------------------------------
/youtube_dl/PostProcessor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | import os
5 | import subprocess
6 | import sys
7 | import time
8 | 
9 | from utils import *
10 | 
11 | 
12 | class PostProcessor(object):
13 |     """Post Processor class.
14 | 
15 |     PostProcessor objects can be added to downloaders with their
16 |     add_post_processor() method.
When the downloader has finished a 17 | successful download, it will take its internal chain of PostProcessors 18 | and start calling the run() method on each one of them, first with 19 | an initial argument and then with the returned value of the previous 20 | PostProcessor. 21 | 22 | The chain will be stopped if one of them ever returns None or the end 23 | of the chain is reached. 24 | 25 | PostProcessor objects follow a "mutual registration" process similar 26 | to InfoExtractor objects. 27 | """ 28 | 29 | _downloader = None 30 | 31 | def __init__(self, downloader=None): 32 | self._downloader = downloader 33 | 34 | def set_downloader(self, downloader): 35 | """Sets the downloader for this PP.""" 36 | self._downloader = downloader 37 | 38 | def run(self, information): 39 | """Run the PostProcessor. 40 | 41 | The "information" argument is a dictionary like the ones 42 | composed by InfoExtractors. The only difference is that this 43 | one has an extra field called "filepath" that points to the 44 | downloaded file. 45 | 46 | When this method returns None, the postprocessing chain is 47 | stopped. However, this method may return an information 48 | dictionary that will be passed to the next postprocessing 49 | object in the chain. It can be the one it received after 50 | changing some fields. 51 | 52 | In addition, this method may raise a PostProcessingError 53 | exception that will be taken into account by the downloader 54 | it was called from. 55 | """ 56 | return information # by default, do nothing 57 | 58 | class AudioConversionError(BaseException): 59 | def __init__(self, message): 60 | self.message = message 61 | 62 | class FFmpegExtractAudioPP(PostProcessor): 63 | def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False): 64 | PostProcessor.__init__(self, downloader) 65 | if preferredcodec is None: 66 | preferredcodec = 'best' 67 | self._preferredcodec = preferredcodec 68 | self._preferredquality = preferredquality 69 | self._keepvideo = keepvideo 70 | self._exes = self.detect_executables() 71 | 72 | @staticmethod 73 | def detect_executables(): 74 | available = {'avprobe' : False, 'avconv' : False, 'ffmpeg' : False, 'ffprobe' : False} 75 | for path in os.environ["PATH"].split(os.pathsep): 76 | for program in available.keys(): 77 | exe_file = os.path.join(path, program) 78 | if os.path.isfile(exe_file) and os.access(exe_file, os.X_OK): 79 | available[program] = exe_file 80 | return available 81 | 82 | def get_audio_codec(self, path): 83 | if not self._exes['ffprobe'] and not self._exes['avprobe']: return None 84 | try: 85 | cmd = [self._exes['avprobe'] or self._exes['ffprobe'], '-show_streams', '--', encodeFilename(path)] 86 | handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE) 87 | output = handle.communicate()[0] 88 | if handle.wait() != 0: 89 | return None 90 | except (IOError, OSError): 91 | return None 92 | audio_codec = None 93 | for line in output.split('\n'): 94 | if line.startswith('codec_name='): 95 | audio_codec = line.split('=')[1].strip() 96 | elif line.strip() == 'codec_type=audio' and audio_codec is not None: 97 | return audio_codec 98 | return None 99 | 100 | def run_ffmpeg(self, path, out_path, codec, more_opts): 101 | if not self._exes['ffmpeg'] and not self._exes['avconv']: 102 | raise AudioConversionError('ffmpeg or avconv not found. 
Please install one.') 103 | if codec is None: 104 | acodec_opts = [] 105 | else: 106 | acodec_opts = ['-acodec', codec] 107 | cmd = ([self._exes['avconv'] or self._exes['ffmpeg'], '-y', '-i', encodeFilename(path), '-vn'] 108 | + acodec_opts + more_opts + 109 | ['--', encodeFilename(out_path)]) 110 | p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 111 | stdout,stderr = p.communicate() 112 | if p.returncode != 0: 113 | msg = stderr.strip().split('\n')[-1] 114 | raise AudioConversionError(msg) 115 | 116 | def run(self, information): 117 | path = information['filepath'] 118 | 119 | filecodec = self.get_audio_codec(path) 120 | if filecodec is None: 121 | self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe') 122 | return None 123 | 124 | more_opts = [] 125 | if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'): 126 | if self._preferredcodec == 'm4a' and filecodec == 'aac': 127 | # Lossless, but in another container 128 | acodec = 'copy' 129 | extension = self._preferredcodec 130 | more_opts = [self._exes['avconv'] and '-bsf:a' or '-absf', 'aac_adtstoasc'] 131 | elif filecodec in ['aac', 'mp3', 'vorbis']: 132 | # Lossless if possible 133 | acodec = 'copy' 134 | extension = filecodec 135 | if filecodec == 'aac': 136 | more_opts = ['-f', 'adts'] 137 | if filecodec == 'vorbis': 138 | extension = 'ogg' 139 | else: 140 | # MP3 otherwise. 141 | acodec = 'libmp3lame' 142 | extension = 'mp3' 143 | more_opts = [] 144 | if self._preferredquality is not None: 145 | more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality] 146 | else: 147 | # We convert the audio (lossy) 148 | acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec] 149 | extension = self._preferredcodec 150 | more_opts = [] 151 | if self._preferredquality is not None: 152 | more_opts += [self._exes['avconv'] and '-b:a' or '-ab', self._preferredquality] 153 | if self._preferredcodec == 'aac': 154 | more_opts += ['-f', 'adts'] 155 | if self._preferredcodec == 'm4a': 156 | more_opts += [self._exes['avconv'] and '-bsf:a' or '-absf', 'aac_adtstoasc'] 157 | if self._preferredcodec == 'vorbis': 158 | extension = 'ogg' 159 | if self._preferredcodec == 'wav': 160 | extension = 'wav' 161 | more_opts += ['-f', 'wav'] 162 | 163 | prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups 164 | new_path = prefix + sep + extension 165 | self._downloader.to_screen(u'[' + (self._exes['avconv'] and 'avconv' or 'ffmpeg') + '] Destination: ' + new_path) 166 | try: 167 | self.run_ffmpeg(path, new_path, acodec, more_opts) 168 | except: 169 | etype,e,tb = sys.exc_info() 170 | if isinstance(e, AudioConversionError): 171 | self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message) 172 | else: 173 | self._downloader.to_stderr(u'ERROR: error running ' + (self._exes['avconv'] and 'avconv' or 'ffmpeg')) 174 | return None 175 | 176 | # Try to update the date time for extracted audio file. 
177 | if information.get('filetime') is not None: 178 | try: 179 | os.utime(encodeFilename(new_path), (time.time(), information['filetime'])) 180 | except: 181 | self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file') 182 | 183 | if not self._keepvideo: 184 | try: 185 | os.remove(encodeFilename(path)) 186 | except (IOError, OSError): 187 | self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file') 188 | return None 189 | 190 | information['filepath'] = new_path 191 | return information 192 | -------------------------------------------------------------------------------- /youtube_dl/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import gzip 5 | import htmlentitydefs 6 | import HTMLParser 7 | import locale 8 | import os 9 | import re 10 | import sys 11 | import zlib 12 | import urllib2 13 | import email.utils 14 | import json 15 | 16 | try: 17 | import cStringIO as StringIO 18 | except ImportError: 19 | import StringIO 20 | 21 | std_headers = { 22 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', 23 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 24 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 25 | 'Accept-Encoding': 'gzip, deflate', 26 | 'Accept-Language': 'en-us,en;q=0.5', 27 | } 28 | 29 | def preferredencoding(): 30 | """Get preferred encoding. 31 | 32 | Returns the best encoding scheme for the system, based on 33 | locale.getpreferredencoding() and some further tweaks. 34 | """ 35 | def yield_preferredencoding(): 36 | try: 37 | pref = locale.getpreferredencoding() 38 | u'TEST'.encode(pref) 39 | except: 40 | pref = 'UTF-8' 41 | while True: 42 | yield pref 43 | return yield_preferredencoding().next() 44 | 45 | 46 | def htmlentity_transform(matchobj): 47 | """Transforms an HTML entity to a Unicode character. 48 | 49 | This function receives a match object and is intended to be used with 50 | the re.sub() function. 
51 | """ 52 | entity = matchobj.group(1) 53 | 54 | # Known non-numeric HTML entity 55 | if entity in htmlentitydefs.name2codepoint: 56 | return unichr(htmlentitydefs.name2codepoint[entity]) 57 | 58 | # Unicode character 59 | mobj = re.match(ur'(?u)#(x?\d+)', entity) 60 | if mobj is not None: 61 | numstr = mobj.group(1) 62 | if numstr.startswith(u'x'): 63 | base = 16 64 | numstr = u'0%s' % numstr 65 | else: 66 | base = 10 67 | return unichr(long(numstr, base)) 68 | 69 | # Unknown entity in name, return its literal representation 70 | return (u'&%s;' % entity) 71 | 72 | HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix 73 | class IDParser(HTMLParser.HTMLParser): 74 | """Modified HTMLParser that isolates a tag with the specified id""" 75 | def __init__(self, id): 76 | self.id = id 77 | self.result = None 78 | self.started = False 79 | self.depth = {} 80 | self.html = None 81 | self.watch_startpos = False 82 | self.error_count = 0 83 | HTMLParser.HTMLParser.__init__(self) 84 | 85 | def error(self, message): 86 | print >> sys.stderr, self.getpos() 87 | if self.error_count > 10 or self.started: 88 | raise HTMLParser.HTMLParseError(message, self.getpos()) 89 | self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line 90 | self.error_count += 1 91 | self.goahead(1) 92 | 93 | def loads(self, html): 94 | self.html = html 95 | self.feed(html) 96 | self.close() 97 | 98 | def handle_starttag(self, tag, attrs): 99 | attrs = dict(attrs) 100 | if self.started: 101 | self.find_startpos(None) 102 | if 'id' in attrs and attrs['id'] == self.id: 103 | self.result = [tag] 104 | self.started = True 105 | self.watch_startpos = True 106 | if self.started: 107 | if not tag in self.depth: self.depth[tag] = 0 108 | self.depth[tag] += 1 109 | 110 | def handle_endtag(self, tag): 111 | if self.started: 112 | if tag in self.depth: self.depth[tag] -= 1 113 | if self.depth[self.result[0]] == 0: 114 | self.started = False 115 | self.result.append(self.getpos()) 116 | 117 | def find_startpos(self, x): 118 | """Needed to put the start position of the result (self.result[1]) 119 | after the opening tag with the requested id""" 120 | if self.watch_startpos: 121 | self.watch_startpos = False 122 | self.result.append(self.getpos()) 123 | handle_entityref = handle_charref = handle_data = handle_comment = \ 124 | handle_decl = handle_pi = unknown_decl = find_startpos 125 | 126 | def get_result(self): 127 | if self.result == None: return None 128 | if len(self.result) != 3: return None 129 | lines = self.html.split('\n') 130 | lines = lines[self.result[1][0]-1:self.result[2][0]] 131 | lines[0] = lines[0][self.result[1][1]:] 132 | if len(lines) == 1: 133 | lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]] 134 | lines[-1] = lines[-1][:self.result[2][1]] 135 | return '\n'.join(lines).strip() 136 | 137 | def get_element_by_id(id, html): 138 | """Return the content of the tag with the specified id in the passed HTML document""" 139 | parser = IDParser(id) 140 | try: 141 | parser.loads(html) 142 | except HTMLParser.HTMLParseError: 143 | pass 144 | return parser.get_result() 145 | 146 | 147 | def clean_html(html): 148 | """Clean an HTML snippet into a readable string""" 149 | # Newline vs
150 | html = html.replace('\n', ' ') 151 | html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) 152 | # Strip html tags 153 | html = re.sub('<.*?>', '', html) 154 | # Replace html entities 155 | html = unescapeHTML(html) 156 | return html 157 | 158 | 159 | def sanitize_open(filename, open_mode): 160 | """Try to open the given filename, and slightly tweak it if this fails. 161 | 162 | Attempts to open the given filename. If this fails, it tries to change 163 | the filename slightly, step by step, until it's either able to open it 164 | or it fails and raises a final exception, like the standard open() 165 | function. 166 | 167 | It returns the tuple (stream, definitive_file_name). 168 | """ 169 | try: 170 | if filename == u'-': 171 | if sys.platform == 'win32': 172 | import msvcrt 173 | msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) 174 | return (sys.stdout, filename) 175 | stream = open(encodeFilename(filename), open_mode) 176 | return (stream, filename) 177 | except (IOError, OSError), err: 178 | # In case of error, try to remove win32 forbidden chars 179 | filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) 180 | 181 | # An exception here should be caught in the caller 182 | stream = open(encodeFilename(filename), open_mode) 183 | return (stream, filename) 184 | 185 | 186 | def timeconvert(timestr): 187 | """Convert RFC 2822 defined time string into system timestamp""" 188 | timestamp = None 189 | timetuple = email.utils.parsedate_tz(timestr) 190 | if timetuple is not None: 191 | timestamp = email.utils.mktime_tz(timetuple) 192 | return timestamp 193 | 194 | def sanitize_filename(s): 195 | """Sanitizes a string so it could be used as part of a filename.""" 196 | def replace_insane(char): 197 | if char in u' .\\/|?*<>:"' or ord(char) < 32: 198 | return '_' 199 | return char 200 | return u''.join(map(replace_insane, s)).strip('_') 201 | 202 | def orderedSet(iterable): 203 | """ Remove all duplicates from the input iterable """ 204 | res = [] 205 | for el in iterable: 206 | if el not in res: 207 | res.append(el) 208 | return res 209 | 210 | def unescapeHTML(s): 211 | """ 212 | @param s a string (of type unicode) 213 | """ 214 | assert type(s) == type(u'') 215 | 216 | result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s) 217 | return result 218 | 219 | def encodeFilename(s): 220 | """ 221 | @param s The name of the file (of type unicode) 222 | """ 223 | 224 | assert type(s) == type(u'') 225 | 226 | if sys.platform == 'win32' and sys.getwindowsversion().major >= 5: 227 | # Pass u'' directly to use Unicode APIs on Windows 2000 and up 228 | # (Detecting Windows NT 4 is tricky because 'major >= 4' would 229 | # match Windows 9x series as well. Besides, NT 4 is obsolete.) 230 | return s 231 | else: 232 | return s.encode(sys.getfilesystemencoding(), 'ignore') 233 | 234 | class DownloadError(Exception): 235 | """Download Error exception. 236 | 237 | This exception may be thrown by FileDownloader objects if they are not 238 | configured to continue on errors. They will contain the appropriate 239 | error message. 240 | """ 241 | pass 242 | 243 | 244 | class SameFileError(Exception): 245 | """Same File exception. 246 | 247 | This exception will be thrown by FileDownloader objects if they detect 248 | multiple files would have to be downloaded to the same file on disk. 249 | """ 250 | pass 251 | 252 | 253 | class PostProcessingError(Exception): 254 | """Post Processing exception. 
255 | 256 | This exception may be raised by PostProcessor's .run() method to 257 | indicate an error in the postprocessing task. 258 | """ 259 | pass 260 | 261 | class MaxDownloadsReached(Exception): 262 | """ --max-downloads limit has been reached. """ 263 | pass 264 | 265 | 266 | class UnavailableVideoError(Exception): 267 | """Unavailable Format exception. 268 | 269 | This exception will be thrown when a video is requested 270 | in a format that is not available for that video. 271 | """ 272 | pass 273 | 274 | 275 | class ContentTooShortError(Exception): 276 | """Content Too Short exception. 277 | 278 | This exception may be raised by FileDownloader objects when a file they 279 | download is too small for what the server announced first, indicating 280 | the connection was probably interrupted. 281 | """ 282 | # Both in bytes 283 | downloaded = None 284 | expected = None 285 | 286 | def __init__(self, downloaded, expected): 287 | self.downloaded = downloaded 288 | self.expected = expected 289 | 290 | 291 | class Trouble(Exception): 292 | """Trouble helper exception 293 | 294 | This is an exception to be handled with 295 | FileDownloader.trouble 296 | """ 297 | 298 | class YoutubeDLHandler(urllib2.HTTPHandler): 299 | """Handler for HTTP requests and responses. 300 | 301 | This class, when installed with an OpenerDirector, automatically adds 302 | the standard headers to every HTTP request and handles gzipped and 303 | deflated responses from web servers. If compression is to be avoided in 304 | a particular request, the original request in the program code only has 305 | to include the HTTP header "Youtubedl-No-Compression", which will be 306 | removed before making the real request. 307 | 308 | Part of this code was copied from: 309 | 310 | http://techknack.net/python-urllib2-handlers/ 311 | 312 | Andrew Rowls, the author of that code, agreed to release it to the 313 | public domain. 
314 | """ 315 | 316 | @staticmethod 317 | def deflate(data): 318 | try: 319 | return zlib.decompress(data, -zlib.MAX_WBITS) 320 | except zlib.error: 321 | return zlib.decompress(data) 322 | 323 | @staticmethod 324 | def addinfourl_wrapper(stream, headers, url, code): 325 | if hasattr(urllib2.addinfourl, 'getcode'): 326 | return urllib2.addinfourl(stream, headers, url, code) 327 | ret = urllib2.addinfourl(stream, headers, url) 328 | ret.code = code 329 | return ret 330 | 331 | def http_request(self, req): 332 | for h in std_headers: 333 | if h in req.headers: 334 | del req.headers[h] 335 | req.add_header(h, std_headers[h]) 336 | if 'Youtubedl-no-compression' in req.headers: 337 | if 'Accept-encoding' in req.headers: 338 | del req.headers['Accept-encoding'] 339 | del req.headers['Youtubedl-no-compression'] 340 | return req 341 | 342 | def http_response(self, req, resp): 343 | old_resp = resp 344 | # gzip 345 | if resp.headers.get('Content-encoding', '') == 'gzip': 346 | gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r') 347 | resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) 348 | resp.msg = old_resp.msg 349 | # deflate 350 | if resp.headers.get('Content-encoding', '') == 'deflate': 351 | gz = StringIO.StringIO(self.deflate(resp.read())) 352 | resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) 353 | resp.msg = old_resp.msg 354 | return resp 355 | -------------------------------------------------------------------------------- /youtube_dl/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | __authors__ = ( 5 | 'Ricardo Garcia Gonzalez', 6 | 'Danny Colligan', 7 | 'Benjamin Johnson', 8 | 'Vasyl\' Vavrychuk', 9 | 'Witold Baryluk', 10 | 'Paweł Paprota', 11 | 'Gergely Imreh', 12 | 'Rogério Brito', 13 | 'Philipp Hagemeister', 14 | 'Sören Schulze', 15 | 'Kevin Ngo', 16 | 'Ori Avtalion', 17 | 'shizeeg', 18 | 'Filippo Valsorda', 19 | ) 20 | 21 | __license__ = 'Public Domain' 22 | __version__ = '2012.02.27' 23 | 24 | UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl' 25 | UPDATE_URL_VERSION = 'https://raw.github.com/rg3/youtube-dl/master/LATEST_VERSION' 26 | UPDATE_URL_EXE = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl.exe' 27 | 28 | 29 | import cookielib 30 | import getpass 31 | import optparse 32 | import os 33 | import re 34 | import shlex 35 | import socket 36 | import subprocess 37 | import sys 38 | import urllib2 39 | import warnings 40 | 41 | from utils import * 42 | from FileDownloader import * 43 | from InfoExtractors import * 44 | from PostProcessor import * 45 | 46 | def updateSelf(downloader, filename): 47 | ''' Update the program file with the latest version from the repository ''' 48 | # Note: downloader only used for options 49 | 50 | if not os.access(filename, os.W_OK): 51 | sys.exit('ERROR: no write permissions on %s' % filename) 52 | 53 | downloader.to_screen(u'Updating to latest version...') 54 | 55 | urlv = urllib2.urlopen(UPDATE_URL_VERSION) 56 | newversion = urlv.read().strip() 57 | if newversion == __version__: 58 | downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')') 59 | return 60 | urlv.close() 61 | 62 | if hasattr(sys, "frozen"): #py2exe 63 | exe = os.path.abspath(filename) 64 | directory = os.path.dirname(exe) 65 | if not os.access(directory, os.W_OK): 66 | sys.exit('ERROR: no write permissions on %s' % directory) 67 | 68 | try: 69 | urlh = 
urllib2.urlopen(UPDATE_URL_EXE) 70 | newcontent = urlh.read() 71 | urlh.close() 72 | with open(exe + '.new', 'wb') as outf: 73 | outf.write(newcontent) 74 | except (IOError, OSError), err: 75 | sys.exit('ERROR: unable to download latest version') 76 | 77 | try: 78 | bat = os.path.join(directory, 'youtube-dl-updater.bat') 79 | b = open(bat, 'w') 80 | 81 | print >> b, """ 82 | echo Updating youtube-dl... 83 | ping 127.0.0.1 -n 5 -w 1000 > NUL 84 | move /Y "%s.new" "%s" 85 | del "%s" 86 | """ %(exe, exe, bat) 87 | 88 | b.close() 89 | 90 | os.startfile(bat) 91 | except (IOError, OSError), err: 92 | sys.exit('ERROR: unable to overwrite current version') 93 | 94 | else: 95 | try: 96 | urlh = urllib2.urlopen(UPDATE_URL) 97 | newcontent = urlh.read() 98 | urlh.close() 99 | except (IOError, OSError), err: 100 | sys.exit('ERROR: unable to download latest version') 101 | 102 | try: 103 | with open(filename, 'wb') as outf: 104 | outf.write(newcontent) 105 | except (IOError, OSError), err: 106 | sys.exit('ERROR: unable to overwrite current version') 107 | 108 | downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.') 109 | 110 | def parseOpts(): 111 | def _readOptions(filename_bytes): 112 | try: 113 | optionf = open(filename_bytes) 114 | except IOError: 115 | return [] # silently skip if file is not present 116 | try: 117 | res = [] 118 | for l in optionf: 119 | res += shlex.split(l, comments=True) 120 | finally: 121 | optionf.close() 122 | return res 123 | 124 | def _format_option_string(option): 125 | ''' ('-o', '--option') -> -o, --format METAVAR''' 126 | 127 | opts = [] 128 | 129 | if option._short_opts: opts.append(option._short_opts[0]) 130 | if option._long_opts: opts.append(option._long_opts[0]) 131 | if len(opts) > 1: opts.insert(1, ', ') 132 | 133 | if option.takes_value(): opts.append(' %s' % option.metavar) 134 | 135 | return "".join(opts) 136 | 137 | def _find_term_columns(): 138 | columns = os.environ.get('COLUMNS', None) 139 | if columns: 140 | return int(columns) 141 | 142 | try: 143 | sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) 144 | out,err = sp.communicate() 145 | return int(out.split()[1]) 146 | except: 147 | pass 148 | return None 149 | 150 | max_width = 80 151 | max_help_position = 80 152 | 153 | # No need to wrap help messages if we're on a wide console 154 | columns = _find_term_columns() 155 | if columns: max_width = columns 156 | 157 | fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position) 158 | fmt.format_option_strings = _format_option_string 159 | 160 | kw = { 161 | 'version' : __version__, 162 | 'formatter' : fmt, 163 | 'usage' : '%prog [options] url [url...]', 164 | 'conflict_handler' : 'resolve', 165 | } 166 | 167 | parser = optparse.OptionParser(**kw) 168 | 169 | # option groups 170 | general = optparse.OptionGroup(parser, 'General Options') 171 | selection = optparse.OptionGroup(parser, 'Video Selection') 172 | authentication = optparse.OptionGroup(parser, 'Authentication Options') 173 | video_format = optparse.OptionGroup(parser, 'Video Format Options') 174 | postproc = optparse.OptionGroup(parser, 'Post-processing Options') 175 | filesystem = optparse.OptionGroup(parser, 'Filesystem Options') 176 | verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') 177 | 178 | general.add_option('-h', '--help', 179 | action='help', help='print this help text and exit') 180 | general.add_option('-v', '--version', 181 | action='version', help='print 
program version and exit') 182 | general.add_option('-U', '--update', 183 | action='store_true', dest='update_self', help='update this program to latest version') 184 | general.add_option('-i', '--ignore-errors', 185 | action='store_true', dest='ignoreerrors', help='continue on download errors', default=False) 186 | general.add_option('-r', '--rate-limit', 187 | dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)') 188 | general.add_option('-R', '--retries', 189 | dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10) 190 | general.add_option('--dump-user-agent', 191 | action='store_true', dest='dump_user_agent', 192 | help='display the current browser identification', default=False) 193 | general.add_option('--list-extractors', 194 | action='store_true', dest='list_extractors', 195 | help='List all supported extractors and the URLs they would handle', default=False) 196 | 197 | selection.add_option('--playlist-start', 198 | dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1) 199 | selection.add_option('--playlist-end', 200 | dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1) 201 | selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)') 202 | selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)') 203 | selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None) 204 | 205 | authentication.add_option('-u', '--username', 206 | dest='username', metavar='USERNAME', help='account username') 207 | authentication.add_option('-p', '--password', 208 | dest='password', metavar='PASSWORD', help='account password') 209 | authentication.add_option('-n', '--netrc', 210 | action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False) 211 | 212 | 213 | video_format.add_option('-f', '--format', 214 | action='store', dest='format', metavar='FORMAT', help='video format code') 215 | video_format.add_option('--all-formats', 216 | action='store_const', dest='format', help='download all available video formats', const='all') 217 | video_format.add_option('--prefer-free-formats', 218 | action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested') 219 | video_format.add_option('--max-quality', 220 | action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') 221 | video_format.add_option('-F', '--list-formats', 222 | action='store_true', dest='listformats', help='list all available formats (currently youtube only)') 223 | video_format.add_option('--write-srt', 224 | action='store_true', dest='writesubtitles', 225 | help='write video closed captions to a .srt file (currently youtube only)', default=False) 226 | video_format.add_option('--srt-lang', 227 | action='store', dest='subtitleslang', metavar='LANG', 228 | help='language of the closed captions to download (optional) use IETF language tags like \'en\'') 229 | 230 | 231 | verbosity.add_option('-q', '--quiet', 232 | action='store_true', dest='quiet', help='activates quiet mode', default=False) 233 | verbosity.add_option('-s', '--simulate', 234 | action='store_true', dest='simulate', help='do not download the video 
and do not write anything to disk', default=False) 235 | verbosity.add_option('--skip-download', 236 | action='store_true', dest='skip_download', help='do not download the video', default=False) 237 | verbosity.add_option('-g', '--get-url', 238 | action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False) 239 | verbosity.add_option('-e', '--get-title', 240 | action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False) 241 | verbosity.add_option('--get-thumbnail', 242 | action='store_true', dest='getthumbnail', 243 | help='simulate, quiet but print thumbnail URL', default=False) 244 | verbosity.add_option('--get-description', 245 | action='store_true', dest='getdescription', 246 | help='simulate, quiet but print video description', default=False) 247 | verbosity.add_option('--get-filename', 248 | action='store_true', dest='getfilename', 249 | help='simulate, quiet but print output filename', default=False) 250 | verbosity.add_option('--get-format', 251 | action='store_true', dest='getformat', 252 | help='simulate, quiet but print output format', default=False) 253 | verbosity.add_option('--no-progress', 254 | action='store_true', dest='noprogress', help='do not print progress bar', default=False) 255 | verbosity.add_option('--console-title', 256 | action='store_true', dest='consoletitle', 257 | help='display progress in console titlebar', default=False) 258 | verbosity.add_option('-v', '--verbose', 259 | action='store_true', dest='verbose', help='print various debugging information', default=False) 260 | 261 | 262 | filesystem.add_option('-t', '--title', 263 | action='store_true', dest='usetitle', help='use title in file name', default=False) 264 | filesystem.add_option('-l', '--literal', 265 | action='store_true', dest='useliteral', help='use literal title in file name', default=False) 266 | filesystem.add_option('-A', '--auto-number', 267 | action='store_true', dest='autonumber', 268 | help='number downloaded files starting from 00000', default=False) 269 | filesystem.add_option('-o', '--output', 270 | dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. 
Use - to output to stdout.') 271 | filesystem.add_option('-a', '--batch-file', 272 | dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)') 273 | filesystem.add_option('-w', '--no-overwrites', 274 | action='store_true', dest='nooverwrites', help='do not overwrite files', default=False) 275 | filesystem.add_option('-c', '--continue', 276 | action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True) 277 | filesystem.add_option('--no-continue', 278 | action='store_false', dest='continue_dl', 279 | help='do not resume partially downloaded files (restart from beginning)') 280 | filesystem.add_option('--cookies', 281 | dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in') 282 | filesystem.add_option('--no-part', 283 | action='store_true', dest='nopart', help='do not use .part files', default=False) 284 | filesystem.add_option('--no-mtime', 285 | action='store_false', dest='updatetime', 286 | help='do not use the Last-modified header to set the file modification time', default=True) 287 | filesystem.add_option('--write-description', 288 | action='store_true', dest='writedescription', 289 | help='write video description to a .description file', default=False) 290 | filesystem.add_option('--write-info-json', 291 | action='store_true', dest='writeinfojson', 292 | help='write video metadata to a .info.json file', default=False) 293 | 294 | 295 | postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False, 296 | help='convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') 297 | postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best', 298 | help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default') 299 | postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K', 300 | help='ffmpeg/avconv audio bitrate specification, 128k by default') 301 | postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False, 302 | help='keeps the video file on disk after the post-processing; the video is erased by default') 303 | 304 | 305 | parser.add_option_group(general) 306 | parser.add_option_group(selection) 307 | parser.add_option_group(filesystem) 308 | parser.add_option_group(verbosity) 309 | parser.add_option_group(video_format) 310 | parser.add_option_group(authentication) 311 | parser.add_option_group(postproc) 312 | 313 | xdg_config_home = os.environ.get('XDG_CONFIG_HOME') 314 | if xdg_config_home: 315 | userConf = os.path.join(xdg_config_home, 'youtube-dl.conf') 316 | else: 317 | userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf') 318 | argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:] 319 | opts, args = parser.parse_args(argv) 320 | 321 | return parser, opts, args 322 | 323 | def gen_extractors(): 324 | """ Return a list of an instance of every supported extractor. 325 | The order does matter; the first extractor matched is the one handling the URL. 
326 | """ 327 | return [ 328 | YoutubePlaylistIE(), 329 | YoutubeUserIE(), 330 | YoutubeSearchIE(), 331 | YoutubeIE(), 332 | MetacafeIE(), 333 | DailymotionIE(), 334 | GoogleIE(), 335 | GoogleSearchIE(), 336 | PhotobucketIE(), 337 | YahooIE(), 338 | YahooSearchIE(), 339 | DepositFilesIE(), 340 | FacebookIE(), 341 | BlipTVIE(), 342 | VimeoIE(), 343 | MyVideoIE(), 344 | ComedyCentralIE(), 345 | EscapistIE(), 346 | CollegeHumorIE(), 347 | XVideosIE(), 348 | SoundcloudIE(), 349 | InfoQIE(), 350 | MixcloudIE(), 351 | StanfordOpenClassroomIE(), 352 | MTVIE(), 353 | 354 | GenericIE() 355 | ] 356 | 357 | def _real_main(): 358 | parser, opts, args = parseOpts() 359 | 360 | # Open appropriate CookieJar 361 | if opts.cookiefile is None: 362 | jar = cookielib.CookieJar() 363 | else: 364 | try: 365 | jar = cookielib.MozillaCookieJar(opts.cookiefile) 366 | if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK): 367 | jar.load() 368 | except (IOError, OSError), err: 369 | sys.exit(u'ERROR: unable to open cookie file') 370 | 371 | # Dump user agent 372 | if opts.dump_user_agent: 373 | print std_headers['User-Agent'] 374 | sys.exit(0) 375 | 376 | # Batch file verification 377 | batchurls = [] 378 | if opts.batchfile is not None: 379 | try: 380 | if opts.batchfile == '-': 381 | batchfd = sys.stdin 382 | else: 383 | batchfd = open(opts.batchfile, 'r') 384 | batchurls = batchfd.readlines() 385 | batchurls = [x.strip() for x in batchurls] 386 | batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] 387 | except IOError: 388 | sys.exit(u'ERROR: batch file could not be read') 389 | all_urls = batchurls + args 390 | all_urls = map(lambda url: url.strip(), all_urls) 391 | 392 | # General configuration 393 | cookie_processor = urllib2.HTTPCookieProcessor(jar) 394 | proxy_handler = urllib2.ProxyHandler() 395 | opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler()) 396 | urllib2.install_opener(opener) 397 | socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) 398 | 399 | extractors = gen_extractors() 400 | 401 | if opts.list_extractors: 402 | for ie in extractors: 403 | print(ie.IE_NAME) 404 | matchedUrls = filter(lambda url: ie.suitable(url), all_urls) 405 | all_urls = filter(lambda url: url not in matchedUrls, all_urls) 406 | for mu in matchedUrls: 407 | print(u' ' + mu) 408 | sys.exit(0) 409 | 410 | # Conflicting, missing and erroneous options 411 | if opts.usenetrc and (opts.username is not None or opts.password is not None): 412 | parser.error(u'using .netrc conflicts with giving username/password') 413 | if opts.password is not None and opts.username is None: 414 | parser.error(u'account username missing') 415 | if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber): 416 | parser.error(u'using output template conflicts with using title, literal title or auto number') 417 | if opts.usetitle and opts.useliteral: 418 | parser.error(u'using title conflicts with using literal title') 419 | if opts.username is not None and opts.password is None: 420 | opts.password = getpass.getpass(u'Type account password and press return:') 421 | if opts.ratelimit is not None: 422 | numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) 423 | if numeric_limit is None: 424 | parser.error(u'invalid rate limit specified') 425 | opts.ratelimit = numeric_limit 426 | if opts.retries is not None: 427 | try: 428 | opts.retries = long(opts.retries) 429 | except (TypeError, ValueError), err: 430 | 
parser.error(u'invalid retry count specified') 431 | try: 432 | opts.playliststart = int(opts.playliststart) 433 | if opts.playliststart <= 0: 434 | raise ValueError(u'Playlist start must be positive') 435 | except (TypeError, ValueError), err: 436 | parser.error(u'invalid playlist start number specified') 437 | try: 438 | opts.playlistend = int(opts.playlistend) 439 | if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart): 440 | raise ValueError(u'Playlist end must be greater than playlist start') 441 | except (TypeError, ValueError), err: 442 | parser.error(u'invalid playlist end number specified') 443 | if opts.extractaudio: 444 | if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']: 445 | parser.error(u'invalid audio format specified') 446 | 447 | # File downloader 448 | fd = FileDownloader({ 449 | 'usenetrc': opts.usenetrc, 450 | 'username': opts.username, 451 | 'password': opts.password, 452 | 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat), 453 | 'forceurl': opts.geturl, 454 | 'forcetitle': opts.gettitle, 455 | 'forcethumbnail': opts.getthumbnail, 456 | 'forcedescription': opts.getdescription, 457 | 'forcefilename': opts.getfilename, 458 | 'forceformat': opts.getformat, 459 | 'simulate': opts.simulate, 460 | 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat), 461 | 'format': opts.format, 462 | 'format_limit': opts.format_limit, 463 | 'listformats': opts.listformats, 464 | 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding())) 465 | or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s') 466 | or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s') 467 | or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s') 468 | or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s') 469 | or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s') 470 | or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s') 471 | or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s') 472 | or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s') 473 | or u'%(id)s.%(ext)s'), 474 | 'ignoreerrors': opts.ignoreerrors, 475 | 'ratelimit': opts.ratelimit, 476 | 'nooverwrites': opts.nooverwrites, 477 | 'retries': opts.retries, 478 | 'continuedl': opts.continue_dl, 479 | 'noprogress': opts.noprogress, 480 | 'playliststart': opts.playliststart, 481 | 'playlistend': opts.playlistend, 482 | 'logtostderr': opts.outtmpl == '-', 483 | 'consoletitle': opts.consoletitle, 484 | 'nopart': opts.nopart, 485 | 'updatetime': opts.updatetime, 486 | 'writedescription': opts.writedescription, 487 | 'writeinfojson': opts.writeinfojson, 488 | 'writesubtitles': opts.writesubtitles, 489 | 'subtitleslang': opts.subtitleslang, 490 | 'matchtitle': opts.matchtitle, 491 | 'rejecttitle': opts.rejecttitle, 492 | 'max_downloads': opts.max_downloads, 493 | 'prefer_free_formats': opts.prefer_free_formats, 494 | 'verbose': opts.verbose, 495 | }) 496 | 497 | if opts.verbose: 498 | fd.to_screen(u'[debug] Proxy map: ' + str(proxy_handler.proxies)) 499 | 500 | for extractor in extractors: 501 | fd.add_info_extractor(extractor) 502 | 503 | # PostProcessors 504 | if opts.extractaudio: 505 | 
fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo)) 506 | 507 | # Update version 508 | if opts.update_self: 509 | updateSelf(fd, sys.argv[0]) 510 | 511 | # Maybe do nothing 512 | if len(all_urls) < 1: 513 | if not opts.update_self: 514 | parser.error(u'you must provide at least one URL') 515 | else: 516 | sys.exit() 517 | 518 | try: 519 | retcode = fd.download(all_urls) 520 | except MaxDownloadsReached: 521 | fd.to_screen(u'--max-download limit reached, aborting.') 522 | retcode = 101 523 | 524 | # Dump cookie jar if requested 525 | if opts.cookiefile is not None: 526 | try: 527 | jar.save() 528 | except (IOError, OSError), err: 529 | sys.exit(u'ERROR: unable to save cookie jar') 530 | 531 | sys.exit(retcode) 532 | 533 | def main(): 534 | try: 535 | _real_main() 536 | except DownloadError: 537 | sys.exit(1) 538 | except SameFileError: 539 | sys.exit(u'ERROR: fixed output name but more than one file to download') 540 | except KeyboardInterrupt: 541 | sys.exit(u'\nERROR: Interrupted by user') 542 | -------------------------------------------------------------------------------- /youtube_dl/FileDownloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import httplib 5 | import math 6 | import os 7 | import re 8 | import socket 9 | import subprocess 10 | import sys 11 | import time 12 | import urllib2 13 | 14 | if os.name == 'nt': 15 | import ctypes 16 | 17 | from utils import * 18 | 19 | 20 | class FileDownloader(object): 21 | """File Downloader class. 22 | 23 | File downloader objects are the ones responsible for downloading the 24 | actual video file and writing it to disk if the user has requested 25 | it, among some other tasks. In most cases there should be one per 26 | program. As, given a video URL, the downloader doesn't know how to 27 | extract all the needed information, a task that InfoExtractors do, it 28 | has to pass the URL to one of them. 29 | 30 | For this, file downloader objects have a method that allows 31 | InfoExtractors to be registered in a given order. When it is passed 32 | a URL, the file downloader hands it to the first InfoExtractor it 33 | finds that reports being able to handle it. The InfoExtractor extracts 34 | all the information about the video or videos the URL refers to, and 35 | asks the FileDownloader to process the video information, possibly 36 | downloading the video. 37 | 38 | File downloaders accept a lot of parameters. In order not to saturate 39 | the object constructor with arguments, it receives a dictionary of 40 | options instead. These options are available through the params 41 | attribute for the InfoExtractors to use. The FileDownloader also 42 | registers itself as the downloader in charge for the InfoExtractors 43 | that are added to it, so this is a "mutual registration". 44 | 45 | Available options: 46 | 47 | username: Username for authentication purposes. 48 | password: Password for authentication purposes. 49 | usenetrc: Use netrc for authentication instead. 50 | quiet: Do not print messages to stdout. 51 | forceurl: Force printing final URL. 52 | forcetitle: Force printing title. 53 | forcethumbnail: Force printing thumbnail URL. 54 | forcedescription: Force printing description. 55 | forcefilename: Force printing final filename. 56 | simulate: Do not download the video files. 57 | format: Video format code.
58 | format_limit: Highest quality format to try. 59 | outtmpl: Template for output names. 60 | ignoreerrors: Do not stop on download errors. 61 | ratelimit: Download speed limit, in bytes/sec. 62 | nooverwrites: Prevent overwriting files. 63 | retries: Number of times to retry for HTTP error 5xx 64 | continuedl: Try to continue downloads if possible. 65 | noprogress: Do not print the progress bar. 66 | playliststart: Playlist item to start at. 67 | playlistend: Playlist item to end at. 68 | matchtitle: Download only matching titles. 69 | rejecttitle: Reject downloads for matching titles. 70 | logtostderr: Log messages to stderr instead of stdout. 71 | consoletitle: Display progress in console window's titlebar. 72 | nopart: Do not use temporary .part files. 73 | updatetime: Use the Last-modified header to set output file timestamps. 74 | writedescription: Write the video description to a .description file 75 | writeinfojson: Write the video description to a .info.json file 76 | writesubtitles: Write the video subtitles to a .srt file 77 | subtitleslang: Language of the subtitles to download 78 | """ 79 | 80 | params = None 81 | _ies = [] 82 | _pps = [] 83 | _download_retcode = None 84 | _num_downloads = None 85 | _screen_file = None 86 | 87 | def __init__(self, params): 88 | """Create a FileDownloader object with the given options.""" 89 | self._ies = [] 90 | self._pps = [] 91 | self._download_retcode = 0 92 | self._num_downloads = 0 93 | self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] 94 | self.params = params 95 | 96 | @staticmethod 97 | def format_bytes(bytes): 98 | if bytes is None: 99 | return 'N/A' 100 | if type(bytes) is str: 101 | bytes = float(bytes) 102 | if bytes == 0.0: 103 | exponent = 0 104 | else: 105 | exponent = long(math.log(bytes, 1024.0)) 106 | suffix = 'bkMGTPEZY'[exponent] 107 | converted = float(bytes) / float(1024 ** exponent) 108 | return '%.2f%s' % (converted, suffix) 109 | 110 | @staticmethod 111 | def calc_percent(byte_counter, data_len): 112 | if data_len is None: 113 | return '---.-%' 114 | return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0)) 115 | 116 | @staticmethod 117 | def calc_eta(start, now, total, current): 118 | if total is None: 119 | return '--:--' 120 | dif = now - start 121 | if current == 0 or dif < 0.001: # One millisecond 122 | return '--:--' 123 | rate = float(current) / dif 124 | eta = long((float(total) - float(current)) / rate) 125 | (eta_mins, eta_secs) = divmod(eta, 60) 126 | if eta_mins > 99: 127 | return '--:--' 128 | return '%02d:%02d' % (eta_mins, eta_secs) 129 | 130 | @staticmethod 131 | def calc_speed(start, now, bytes): 132 | dif = now - start 133 | if bytes == 0 or dif < 0.001: # One millisecond 134 | return '%10s' % '---b/s' 135 | return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif)) 136 | 137 | @staticmethod 138 | def best_block_size(elapsed_time, bytes): 139 | new_min = max(bytes / 2.0, 1.0) 140 | new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB 141 | if elapsed_time < 0.001: 142 | return long(new_max) 143 | rate = bytes / elapsed_time 144 | if rate > new_max: 145 | return long(new_max) 146 | if rate < new_min: 147 | return long(new_min) 148 | return long(rate) 149 | 150 | @staticmethod 151 | def parse_bytes(bytestr): 152 | """Parse a string indicating a byte quantity into a long integer.""" 153 | matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) 154 | if matchobj is None: 155 | return None 156 | number = 
float(matchobj.group(1)) 157 | multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) 158 | return long(round(number * multiplier)) 159 | 160 | def add_info_extractor(self, ie): 161 | """Add an InfoExtractor object to the end of the list.""" 162 | self._ies.append(ie) 163 | ie.set_downloader(self) 164 | 165 | def add_post_processor(self, pp): 166 | """Add a PostProcessor object to the end of the chain.""" 167 | self._pps.append(pp) 168 | pp.set_downloader(self) 169 | 170 | def to_screen(self, message, skip_eol=False): 171 | """Print message to stdout if not in quiet mode.""" 172 | assert type(message) == type(u'') 173 | if not self.params.get('quiet', False): 174 | terminator = [u'\n', u''][skip_eol] 175 | output = message + terminator 176 | 177 | if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr 178 | output = output.encode(preferredencoding(), 'ignore') 179 | self._screen_file.write(output) 180 | self._screen_file.flush() 181 | 182 | def to_stderr(self, message): 183 | """Print message to stderr.""" 184 | print >>sys.stderr, message.encode(preferredencoding()) 185 | 186 | def to_cons_title(self, message): 187 | """Set console/terminal window title to message.""" 188 | if not self.params.get('consoletitle', False): 189 | return 190 | if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow(): 191 | # c_wchar_p() might not be necessary if `message` is 192 | # already of type unicode() 193 | ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) 194 | elif 'TERM' in os.environ: 195 | sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding())) 196 | 197 | def fixed_template(self): 198 | """Checks if the output template is fixed.""" 199 | return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None) 200 | 201 | def trouble(self, message=None): 202 | """Determine action to take when a download problem appears. 203 | 204 | Depending on if the downloader has been configured to ignore 205 | download errors or not, this method may throw an exception or 206 | not when errors are found, after printing the message. 
207 | """ 208 | if message is not None: 209 | self.to_stderr(message) 210 | if not self.params.get('ignoreerrors', False): 211 | raise DownloadError(message) 212 | self._download_retcode = 1 213 | 214 | def slow_down(self, start_time, byte_counter): 215 | """Sleep if the download speed is over the rate limit.""" 216 | rate_limit = self.params.get('ratelimit', None) 217 | if rate_limit is None or byte_counter == 0: 218 | return 219 | now = time.time() 220 | elapsed = now - start_time 221 | if elapsed <= 0.0: 222 | return 223 | speed = float(byte_counter) / elapsed 224 | if speed > rate_limit: 225 | time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit) 226 | 227 | def temp_name(self, filename): 228 | """Returns a temporary filename for the given filename.""" 229 | if self.params.get('nopart', False) or filename == u'-' or \ 230 | (os.path.exists(encodeFilename(filename)) and not os.path.isfile(encodeFilename(filename))): 231 | return filename 232 | return filename + u'.part' 233 | 234 | def undo_temp_name(self, filename): 235 | if filename.endswith(u'.part'): 236 | return filename[:-len(u'.part')] 237 | return filename 238 | 239 | def try_rename(self, old_filename, new_filename): 240 | try: 241 | if old_filename == new_filename: 242 | return 243 | os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) 244 | except (IOError, OSError), err: 245 | self.trouble(u'ERROR: unable to rename file') 246 | 247 | def try_utime(self, filename, last_modified_hdr): 248 | """Try to set the last-modified time of the given file.""" 249 | if last_modified_hdr is None: 250 | return 251 | if not os.path.isfile(encodeFilename(filename)): 252 | return 253 | timestr = last_modified_hdr 254 | if timestr is None: 255 | return 256 | filetime = timeconvert(timestr) 257 | if filetime is None: 258 | return filetime 259 | try: 260 | os.utime(filename, (time.time(), filetime)) 261 | except: 262 | pass 263 | return filetime 264 | 265 | def report_writedescription(self, descfn): 266 | """ Report that the description file is being written """ 267 | self.to_screen(u'[info] Writing video description to: ' + descfn) 268 | 269 | def report_writesubtitles(self, srtfn): 270 | """ Report that the subtitles file is being written """ 271 | self.to_screen(u'[info] Writing video subtitles to: ' + srtfn) 272 | 273 | def report_writeinfojson(self, infofn): 274 | """ Report that the metadata file has been written """ 275 | self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) 276 | 277 | def report_destination(self, filename): 278 | """Report destination filename.""" 279 | self.to_screen(u'[download] Destination: ' + filename) 280 | 281 | def report_progress(self, percent_str, data_len_str, speed_str, eta_str): 282 | """Report download progress.""" 283 | if self.params.get('noprogress', False): 284 | return 285 | self.to_screen(u'\r[download] %s of %s at %s ETA %s' % 286 | (percent_str, data_len_str, speed_str, eta_str), skip_eol=True) 287 | self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' % 288 | (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())) 289 | 290 | def report_resuming_byte(self, resume_len): 291 | """Report attempt to resume at given byte.""" 292 | self.to_screen(u'[download] Resuming download at byte %s' % resume_len) 293 | 294 | def report_retry(self, count, retries): 295 | """Report retry in case of HTTP error 5xx""" 296 | self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' 
% (count, retries)) 297 | 298 | def report_file_already_downloaded(self, file_name): 299 | """Report file has already been fully downloaded.""" 300 | try: 301 | self.to_screen(u'[download] %s has already been downloaded' % file_name) 302 | except (UnicodeEncodeError), err: 303 | self.to_screen(u'[download] The file has already been downloaded') 304 | 305 | def report_unable_to_resume(self): 306 | """Report it was impossible to resume download.""" 307 | self.to_screen(u'[download] Unable to resume') 308 | 309 | def report_finish(self): 310 | """Report download finished.""" 311 | if self.params.get('noprogress', False): 312 | self.to_screen(u'[download] Download completed') 313 | else: 314 | self.to_screen(u'') 315 | 316 | def increment_downloads(self): 317 | """Increment the ordinal that assigns a number to each file.""" 318 | self._num_downloads += 1 319 | 320 | def prepare_filename(self, info_dict): 321 | """Generate the output filename.""" 322 | try: 323 | template_dict = dict(info_dict) 324 | template_dict['epoch'] = unicode(long(time.time())) 325 | template_dict['autonumber'] = unicode('%05d' % self._num_downloads) 326 | filename = self.params['outtmpl'] % template_dict 327 | return filename 328 | except (ValueError, KeyError), err: 329 | self.trouble(u'ERROR: invalid system charset or erroneous output template') 330 | return None 331 | 332 | def _match_entry(self, info_dict): 333 | """ Returns None iff the file should be downloaded """ 334 | 335 | title = info_dict['title'] 336 | matchtitle = self.params.get('matchtitle', False) 337 | if matchtitle and not re.search(matchtitle, title, re.IGNORECASE): 338 | return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' 339 | rejecttitle = self.params.get('rejecttitle', False) 340 | if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE): 341 | return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' 342 | return None 343 | 344 | def process_info(self, info_dict): 345 | """Process a single dictionary returned by an InfoExtractor.""" 346 | 347 | info_dict['stitle'] = sanitize_filename(info_dict['title']) 348 | 349 | reason = self._match_entry(info_dict) 350 | if reason is not None: 351 | self.to_screen(u'[download] ' + reason) 352 | return 353 | 354 | max_downloads = self.params.get('max_downloads') 355 | if max_downloads is not None: 356 | if self._num_downloads > int(max_downloads): 357 | raise MaxDownloadsReached() 358 | 359 | filename = self.prepare_filename(info_dict) 360 | 361 | # Forced printings 362 | if self.params.get('forcetitle', False): 363 | print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace') 364 | if self.params.get('forceurl', False): 365 | print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace') 366 | if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict: 367 | print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') 368 | if self.params.get('forcedescription', False) and 'description' in info_dict: 369 | print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace') 370 | if self.params.get('forcefilename', False) and filename is not None: 371 | print filename.encode(preferredencoding(), 'xmlcharrefreplace') 372 | if self.params.get('forceformat', False): 373 | print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace') 374 | 375 | # Do nothing else if in simulate mode 376 | if self.params.get('simulate', False): 377 | return 378 | 379 | if filename is None: 380 
| return 381 | 382 | try: 383 | dn = os.path.dirname(encodeFilename(filename)) 384 | if dn != '' and not os.path.exists(dn): # dn is already encoded 385 | os.makedirs(dn) 386 | except (OSError, IOError), err: 387 | self.trouble(u'ERROR: unable to create directory ' + unicode(err)) 388 | return 389 | 390 | if self.params.get('writedescription', False): 391 | try: 392 | descfn = filename + u'.description' 393 | self.report_writedescription(descfn) 394 | descfile = open(encodeFilename(descfn), 'wb') 395 | try: 396 | descfile.write(info_dict['description'].encode('utf-8')) 397 | finally: 398 | descfile.close() 399 | except (OSError, IOError): 400 | self.trouble(u'ERROR: Cannot write description file ' + descfn) 401 | return 402 | 403 | if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: 404 | # subtitles download errors are already managed as troubles in the relevant IE 405 | # that way it will silently go on when used with an IE that does not support subtitles 406 | try: 407 | srtfn = filename.rsplit('.', 1)[0] + u'.srt' 408 | self.report_writesubtitles(srtfn) 409 | srtfile = open(encodeFilename(srtfn), 'wb') 410 | try: 411 | srtfile.write(info_dict['subtitles'].encode('utf-8')) 412 | finally: 413 | srtfile.close() 414 | except (OSError, IOError): 415 | self.trouble(u'ERROR: Cannot write subtitles file ' + srtfn) 416 | return 417 | 418 | if self.params.get('writeinfojson', False): 419 | infofn = filename + u'.info.json' 420 | self.report_writeinfojson(infofn) 421 | try: 422 | json.dump 423 | except (NameError,AttributeError): 424 | self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.') 425 | return 426 | try: 427 | infof = open(encodeFilename(infofn), 'wb') 428 | try: 429 | json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',)) 430 | json.dump(json_info_dict, infof) 431 | finally: 432 | infof.close() 433 | except (OSError, IOError): 434 | self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn) 435 | return 436 | 437 | if not self.params.get('skip_download', False): 438 | if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)): 439 | success = True 440 | else: 441 | try: 442 | success = self._do_download(filename, info_dict) 443 | except (OSError, IOError), err: 444 | raise UnavailableVideoError 445 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 446 | self.trouble(u'ERROR: unable to download video data: %s' % str(err)) 447 | return 448 | except (ContentTooShortError, ), err: 449 | self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded)) 450 | return 451 | 452 | if success: 453 | try: 454 | self.post_process(filename, info_dict) 455 | except (PostProcessingError), err: 456 | self.trouble(u'ERROR: postprocessing: %s' % str(err)) 457 | return 458 | 459 | def download(self, url_list): 460 | """Download a given list of URLs.""" 461 | if len(url_list) > 1 and self.fixed_template(): 462 | raise SameFileError(self.params['outtmpl']) 463 | 464 | for url in url_list: 465 | suitable_found = False 466 | for ie in self._ies: 467 | # Go to next InfoExtractor if not suitable 468 | if not ie.suitable(url): 469 | continue 470 | 471 | # Suitable InfoExtractor found 472 | suitable_found = True 473 | 474 | # Extract information from URL and process it 475 | videos = ie.extract(url) 476 | for video in videos or []: 477 | try: 478 | self.increment_downloads() 479 | 
self.process_info(video) 480 | except UnavailableVideoError: 481 | self.trouble(u'\nERROR: unable to download video') 482 | 483 | # Suitable InfoExtractor had been found; go to next URL 484 | break 485 | 486 | if not suitable_found: 487 | self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url) 488 | 489 | return self._download_retcode 490 | 491 | def post_process(self, filename, ie_info): 492 | """Run the postprocessing chain on the given file.""" 493 | info = dict(ie_info) 494 | info['filepath'] = filename 495 | for pp in self._pps: 496 | info = pp.run(info) 497 | if info is None: 498 | break 499 | 500 | def _download_with_rtmpdump(self, filename, url, player_url): 501 | self.report_destination(filename) 502 | tmpfilename = self.temp_name(filename) 503 | 504 | # Check for rtmpdump first 505 | try: 506 | subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT) 507 | except (OSError, IOError): 508 | self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run') 509 | return False 510 | 511 | # Download using rtmpdump. rtmpdump returns exit code 2 when 512 | # the connection was interrupted and resuming appears to be 513 | # possible. This is part of rtmpdump's normal usage, AFAIK. 514 | basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename] 515 | args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)] 516 | if self.params.get('verbose', False): 517 | try: 518 | import pipes 519 | shell_quote = lambda args: ' '.join(map(pipes.quote, args)) 520 | except ImportError: 521 | shell_quote = repr 522 | self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args)) 523 | retval = subprocess.call(args) 524 | while retval == 2 or retval == 1: 525 | prevsize = os.path.getsize(encodeFilename(tmpfilename)) 526 | self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) 527 | time.sleep(5.0) # This seems to be needed 528 | retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) 529 | cursize = os.path.getsize(encodeFilename(tmpfilename)) 530 | if prevsize == cursize and retval == 1: 531 | break 532 | # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those 533 | if prevsize == cursize and retval == 2 and cursize > 1024: 534 | self.to_screen(u'\r[rtmpdump] Could not download the whole video. 
This can happen for some advertisements.') 535 | retval = 0 536 | break 537 | if retval == 0: 538 | self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(encodeFilename(tmpfilename))) 539 | self.try_rename(tmpfilename, filename) 540 | return True 541 | else: 542 | self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval) 543 | return False 544 | 545 | def _do_download(self, filename, info_dict): 546 | url = info_dict['url'] 547 | player_url = info_dict.get('player_url', None) 548 | 549 | # Check file already present 550 | if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False): 551 | self.report_file_already_downloaded(filename) 552 | return True 553 | 554 | # Attempt to download using rtmpdump 555 | if url.startswith('rtmp'): 556 | return self._download_with_rtmpdump(filename, url, player_url) 557 | 558 | tmpfilename = self.temp_name(filename) 559 | stream = None 560 | 561 | # Do not include the Accept-Encoding header 562 | headers = {'Youtubedl-no-compression': 'True'} 563 | basic_request = urllib2.Request(url, None, headers) 564 | request = urllib2.Request(url, None, headers) 565 | 566 | # Establish possible resume length 567 | if os.path.isfile(encodeFilename(tmpfilename)): 568 | resume_len = os.path.getsize(encodeFilename(tmpfilename)) 569 | else: 570 | resume_len = 0 571 | 572 | open_mode = 'wb' 573 | if resume_len != 0: 574 | if self.params.get('continuedl', False): 575 | self.report_resuming_byte(resume_len) 576 | request.add_header('Range','bytes=%d-' % resume_len) 577 | open_mode = 'ab' 578 | else: 579 | resume_len = 0 580 | 581 | count = 0 582 | retries = self.params.get('retries', 0) 583 | while count <= retries: 584 | # Establish connection 585 | try: 586 | if count == 0 and 'urlhandle' in info_dict: 587 | data = info_dict['urlhandle'] 588 | data = urllib2.urlopen(request) 589 | break 590 | except (urllib2.HTTPError, ), err: 591 | if (err.code < 500 or err.code >= 600) and err.code != 416: 592 | # Unexpected HTTP error 593 | raise 594 | elif err.code == 416: 595 | # Unable to resume (requested range not satisfiable) 596 | try: 597 | # Open the connection again without the range header 598 | data = urllib2.urlopen(basic_request) 599 | content_length = data.info()['Content-Length'] 600 | except (urllib2.HTTPError, ), err: 601 | if err.code < 500 or err.code >= 600: 602 | raise 603 | else: 604 | # Examine the reported length 605 | if (content_length is not None and 606 | (resume_len - 100 < long(content_length) < resume_len + 100)): 607 | # The file had already been fully downloaded. 608 | # Explanation to the above condition: in issue #175 it was revealed that 609 | # YouTube sometimes adds or removes a few bytes from the end of the file, 610 | # changing the file size slightly and causing problems for some users. So 611 | # I decided to implement a suggested change and consider the file 612 | # completely downloaded if the file size differs less than 100 bytes from 613 | # the one in the hard drive. 
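# To make the window above concrete, a sketch with hypothetical numbers
# (not from a real download): with resume_len = 4194304 bytes already in
# the .part file and a reported Content-Length of 4194301, the check is
# 4194204 < 4194301 < 4194404, so the file counts as fully downloaded and
# is only renamed below instead of being fetched again.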
614 | self.report_file_already_downloaded(filename) 615 | self.try_rename(tmpfilename, filename) 616 | return True 617 | else: 618 | # The length does not match, we start the download over 619 | self.report_unable_to_resume() 620 | open_mode = 'wb' 621 | break 622 | # Retry 623 | count += 1 624 | if count <= retries: 625 | self.report_retry(count, retries) 626 | 627 | if count > retries: 628 | self.trouble(u'ERROR: giving up after %s retries' % retries) 629 | return False 630 | 631 | data_len = data.info().get('Content-length', None) 632 | if data_len is not None: 633 | data_len = long(data_len) + resume_len 634 | data_len_str = self.format_bytes(data_len) 635 | byte_counter = 0 + resume_len 636 | block_size = 1024 637 | start = time.time() 638 | while True: 639 | # Download and write 640 | before = time.time() 641 | data_block = data.read(block_size) 642 | after = time.time() 643 | if len(data_block) == 0: 644 | break 645 | byte_counter += len(data_block) 646 | 647 | # Open file just in time 648 | if stream is None: 649 | try: 650 | (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) 651 | assert stream is not None 652 | filename = self.undo_temp_name(tmpfilename) 653 | self.report_destination(filename) 654 | except (OSError, IOError), err: 655 | self.trouble(u'ERROR: unable to open for writing: %s' % str(err)) 656 | return False 657 | try: 658 | stream.write(data_block) 659 | except (IOError, OSError), err: 660 | self.trouble(u'\nERROR: unable to write data: %s' % str(err)) 661 | return False 662 | block_size = self.best_block_size(after - before, len(data_block)) 663 | 664 | # Progress message 665 | speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len) 666 | if data_len is None: 667 | self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA') 668 | else: 669 | percent_str = self.calc_percent(byte_counter, data_len) 670 | eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) 671 | self.report_progress(percent_str, data_len_str, speed_str, eta_str) 672 | 673 | # Apply rate limit 674 | self.slow_down(start, byte_counter - resume_len) 675 | 676 | if stream is None: 677 | self.trouble(u'\nERROR: Did not get any data blocks') 678 | return False 679 | stream.close() 680 | self.report_finish() 681 | if data_len is not None and byte_counter != data_len: 682 | raise ContentTooShortError(byte_counter, long(data_len)) 683 | self.try_rename(tmpfilename, filename) 684 | 685 | # Update file modification time 686 | if self.params.get('updatetime', True): 687 | info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) 688 | 689 | return True 690 | -------------------------------------------------------------------------------- /youtube_dl/InfoExtractors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import datetime 5 | import HTMLParser 6 | import httplib 7 | import netrc 8 | import os 9 | import re 10 | import socket 11 | import time 12 | import urllib 13 | import urllib2 14 | import email.utils 15 | import xml.etree.ElementTree 16 | from urlparse import parse_qs 17 | 18 | try: 19 | import cStringIO as StringIO 20 | except ImportError: 21 | import StringIO 22 | 23 | from utils import * 24 | 25 | 26 | class InfoExtractor(object): 27 | """Information Extractor class. 
28 | 29 | Information extractors are the classes that, given a URL, extract 30 | information from the video (or videos) the URL refers to. This 31 | information includes the real video URL, the video title and simplified 32 | title, author and others. The information is stored in a dictionary 33 | which is then passed to the FileDownloader. The FileDownloader 34 | processes this information possibly downloading the video to the file 35 | system, among other possible outcomes. The dictionaries must include 36 | the following fields: 37 | 38 | id: Video identifier. 39 | url: Final video URL. 40 | uploader: Nickname of the video uploader. 41 | title: Literal title. 42 | ext: Video filename extension. 43 | format: Video format. 44 | player_url: SWF Player URL (may be None). 45 | 46 | The following fields are optional. Their primary purpose is to allow 47 | youtube-dl to serve as the backend for a video search function, such 48 | as the one in youtube2mp3. They are only used when their respective 49 | forced printing functions are called: 50 | 51 | thumbnail: Full URL to a video thumbnail image. 52 | description: One-line video description. 53 | 54 | Subclasses of this one should re-define the _real_initialize() and 55 | _real_extract() methods and define a _VALID_URL regexp. 56 | Probably, they should also be added to the list of extractors. 57 | """ 58 | 59 | _ready = False 60 | _downloader = None 61 | 62 | def __init__(self, downloader=None): 63 | """Constructor. Receives an optional downloader.""" 64 | self._ready = False 65 | self.set_downloader(downloader) 66 | 67 | def suitable(self, url): 68 | """Receives a URL and returns True if suitable for this IE.""" 69 | return re.match(self._VALID_URL, url) is not None 70 | 71 | def initialize(self): 72 | """Initializes an instance (authentication, etc).""" 73 | if not self._ready: 74 | self._real_initialize() 75 | self._ready = True 76 | 77 | def extract(self, url): 78 | """Extracts URL information and returns it in list of dicts.""" 79 | self.initialize() 80 | return self._real_extract(url) 81 | 82 | def set_downloader(self, downloader): 83 | """Sets the downloader for this IE.""" 84 | self._downloader = downloader 85 | 86 | def _real_initialize(self): 87 | """Real initialization process. Redefine in subclasses.""" 88 | pass 89 | 90 | def _real_extract(self, url): 91 | """Real extraction process. 
Redefine in subclasses.""" 92 | pass 93 | 94 | 95 | class YoutubeIE(InfoExtractor): 96 | """Information extractor for youtube.com.""" 97 | 98 | _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' 99 | _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' 100 | _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en' 101 | _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' 102 | _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' 103 | _NETRC_MACHINE = 'youtube' 104 | # Listed in order of quality 105 | _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] 106 | _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] 107 | _video_extensions = { 108 | '13': '3gp', 109 | '17': 'mp4', 110 | '18': 'mp4', 111 | '22': 'mp4', 112 | '37': 'mp4', 113 | '38': 'video', # You actually don't know if this will be MOV, AVI or whatever 114 | '43': 'webm', 115 | '44': 'webm', 116 | '45': 'webm', 117 | '46': 'webm', 118 | } 119 | _video_dimensions = { 120 | '5': '240x400', 121 | '6': '???', 122 | '13': '???', 123 | '17': '144x176', 124 | '18': '360x640', 125 | '22': '720x1280', 126 | '34': '360x640', 127 | '35': '480x854', 128 | '37': '1080x1920', 129 | '38': '3072x4096', 130 | '43': '360x640', 131 | '44': '480x854', 132 | '45': '720x1280', 133 | '46': '1080x1920', 134 | } 135 | IE_NAME = u'youtube' 136 | 137 | def report_lang(self): 138 | """Report attempt to set language.""" 139 | self._downloader.to_screen(u'[youtube] Setting language') 140 | 141 | def report_login(self): 142 | """Report attempt to log in.""" 143 | self._downloader.to_screen(u'[youtube] Logging in') 144 | 145 | def report_age_confirmation(self): 146 | """Report attempt to confirm age.""" 147 | self._downloader.to_screen(u'[youtube] Confirming age') 148 | 149 | def report_video_webpage_download(self, video_id): 150 | """Report attempt to download video webpage.""" 151 | self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id) 152 | 153 | def report_video_info_webpage_download(self, video_id): 154 | """Report attempt to download video info webpage.""" 155 | self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) 156 | 157 | def report_video_subtitles_download(self, video_id): 158 | """Report attempt to download video subtitles.""" 159 | self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) 160 | 161 | def report_information_extraction(self, video_id): 162 | """Report attempt to extract video information.""" 163 | self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) 164 | 165 | def report_unavailable_format(self, video_id, format): 166 | """Report that the requested format is unavailable.""" 167 | self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format)) 168 | 169 | def report_rtmp_download(self): 170 | """Indicate the download will use the RTMP protocol.""" 171 | self._downloader.to_screen(u'[youtube] RTMP download detected') 172 | 173 | def _closed_captions_xml_to_srt(self, xml_string): 174 | srt = '' 175 | texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE) 176 | # TODO parse xml instead of regex 177 | for n, (start, dur_tag, dur, caption) in enumerate(texts):
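# A sketch of what this loop produces, using hypothetical caption data
# (not taken from a real video): the XML entry
#   <text start="9.75" dur="2.5">Hello world</text>
# is emitted as the SRT block
#   1
#   00:00:09,750 --> 00:00:12,250
#   Hello world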
178 | if not dur: dur = '4' 179 | start = float(start) 180 | end = start + float(dur) 181 | start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) 182 | end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) 183 | caption = unescapeHTML(caption) 184 | caption = unescapeHTML(caption) # double cycle, intentional 185 | srt += str(n+1) + '\n' 186 | srt += start + ' --> ' + end + '\n' 187 | srt += caption + '\n\n' 188 | return srt 189 | 190 | def _print_formats(self, formats): 191 | print 'Available formats:' 192 | for x in formats: 193 | print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')) 194 | 195 | def _real_initialize(self): 196 | if self._downloader is None: 197 | return 198 | 199 | username = None 200 | password = None 201 | downloader_params = self._downloader.params 202 | 203 | # Attempt to use provided username and password or .netrc data 204 | if downloader_params.get('username', None) is not None: 205 | username = downloader_params['username'] 206 | password = downloader_params['password'] 207 | elif downloader_params.get('usenetrc', False): 208 | try: 209 | info = netrc.netrc().authenticators(self._NETRC_MACHINE) 210 | if info is not None: 211 | username = info[0] 212 | password = info[2] 213 | else: 214 | raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) 215 | except (IOError, netrc.NetrcParseError), err: 216 | self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err)) 217 | return 218 | 219 | # Set language 220 | request = urllib2.Request(self._LANG_URL) 221 | try: 222 | self.report_lang() 223 | urllib2.urlopen(request).read() 224 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 225 | self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err)) 226 | return 227 | 228 | # No authentication to be performed 229 | if username is None: 230 | return 231 | 232 | # Log in 233 | login_form = { 234 | 'current_form': 'loginForm', 235 | 'next': '/', 236 | 'action_login': 'Log In', 237 | 'username': username, 238 | 'password': password, 239 | } 240 | request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form)) 241 | try: 242 | self.report_login() 243 | login_results = urllib2.urlopen(request).read() 244 | if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None: 245 | self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password') 246 | return 247 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 248 | self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err)) 249 | return 250 | 251 | # Confirm age 252 | age_form = { 253 | 'next_url': '/', 254 | 'action_confirm': 'Confirm', 255 | } 256 | request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form)) 257 | try: 258 | self.report_age_confirmation() 259 | age_results = urllib2.urlopen(request).read() 260 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 261 | self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) 262 | return 263 | 264 | def _real_extract(self, url): 265 | # Extract original video URL from URL with redirection, like age verification, using next_url parameter 266 | mobj = re.search(self._NEXT_URL_RE, url) 267 | if mobj: 268 | url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/') 269 | 270 | # Extract video id from URL 271 | mobj = re.match(self._VALID_URL, url) 272 | if mobj is None: 273 | 
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 274 | return 275 | video_id = mobj.group(2) 276 | 277 | # Get video webpage 278 | self.report_video_webpage_download(video_id) 279 | request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id) 280 | try: 281 | video_webpage = urllib2.urlopen(request).read() 282 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 283 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 284 | return 285 | 286 | # Attempt to extract SWF player URL 287 | mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) 288 | if mobj is not None: 289 | player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) 290 | else: 291 | player_url = None 292 | 293 | # Get video info 294 | self.report_video_info_webpage_download(video_id) 295 | for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: 296 | video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' 297 | % (video_id, el_type)) 298 | request = urllib2.Request(video_info_url) 299 | try: 300 | video_info_webpage = urllib2.urlopen(request).read() 301 | video_info = parse_qs(video_info_webpage) 302 | if 'token' in video_info: 303 | break 304 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 305 | self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err)) 306 | return 307 | if 'token' not in video_info: 308 | if 'reason' in video_info: 309 | self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8')) 310 | else: 311 | self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason') 312 | return 313 | 314 | # Check for "rental" videos 315 | if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: 316 | self._downloader.trouble(u'ERROR: "rental" videos not supported') 317 | return 318 | 319 | # Start extracting information 320 | self.report_information_extraction(video_id) 321 | 322 | # uploader 323 | if 'author' not in video_info: 324 | self._downloader.trouble(u'ERROR: unable to extract uploader nickname') 325 | return 326 | video_uploader = urllib.unquote_plus(video_info['author'][0]) 327 | 328 | # title 329 | if 'title' not in video_info: 330 | self._downloader.trouble(u'ERROR: unable to extract video title') 331 | return 332 | video_title = urllib.unquote_plus(video_info['title'][0]) 333 | video_title = video_title.decode('utf-8') 334 | 335 | # thumbnail image 336 | if 'thumbnail_url' not in video_info: 337 | self._downloader.trouble(u'WARNING: unable to extract video thumbnail') 338 | video_thumbnail = '' 339 | else: # don't panic if we can't find it 340 | video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0]) 341 | 342 | # upload date 343 | upload_date = u'NA' 344 | mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL) 345 | if mobj is not None: 346 | upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) 347 | format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y'] 348 | for expression in format_expressions: 349 | try: 350 | upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d') 351 | except: 352 | pass 353 | 354 | # description 355 | video_description = get_element_by_id("eow-description", video_webpage.decode('utf8')) 356 | if video_description: video_description = clean_html(video_description) 357 | else: video_description = '' 358 | 
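# The closed-caption code below first asks Google's timedtext service for
# the list of available subtitle tracks; an illustrative (not captured)
# listing entry looks like
#   <track id="0" name="" lang_code="en" lang_original="English" />
# and the lang_code attributes are what gets collected into srt_lang_list.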
359 | # closed captions 360 | video_subtitles = None 361 | if self._downloader.params.get('writesubtitles', False): 362 | try: 363 | self.report_video_subtitles_download(video_id) 364 | request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) 365 | try: 366 | srt_list = urllib2.urlopen(request).read() 367 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 368 | raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) 369 | srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) 370 | if not srt_lang_list: 371 | raise Trouble(u'WARNING: video has no closed captions') 372 | if self._downloader.params.get('subtitleslang', False): 373 | srt_lang = self._downloader.params.get('subtitleslang') 374 | elif 'en' in srt_lang_list: 375 | srt_lang = 'en' 376 | else: 377 | srt_lang = srt_lang_list[0] 378 | if not srt_lang in srt_lang_list: 379 | raise Trouble(u'WARNING: no closed captions found in the specified language') 380 | request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) 381 | try: 382 | srt_xml = urllib2.urlopen(request).read() 383 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 384 | raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) 385 | video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) 386 | except Trouble as trouble: 387 | self._downloader.trouble(trouble[0]) 388 | 389 | # token 390 | video_token = urllib.unquote_plus(video_info['token'][0]) 391 | 392 | # Decide which formats to download 393 | req_format = self._downloader.params.get('format', None) 394 | 395 | if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): 396 | self.report_rtmp_download() 397 | video_url_list = [(None, video_info['conn'][0])] 398 | elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: 399 | url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') 400 | url_data = [parse_qs(uds) for uds in url_data_strs] 401 | url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data) 402 | url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data) 403 | 404 | format_limit = self._downloader.params.get('format_limit', None) 405 | available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats 406 | if format_limit is not None and format_limit in available_formats: 407 | format_list = available_formats[available_formats.index(format_limit):] 408 | else: 409 | format_list = available_formats 410 | existing_formats = [x for x in format_list if x in url_map] 411 | if len(existing_formats) == 0: 412 | self._downloader.trouble(u'ERROR: no known formats available for video') 413 | return 414 | if self._downloader.params.get('listformats', None): 415 | self._print_formats(existing_formats) 416 | return 417 | if req_format is None or req_format == 'best': 418 | video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality 419 | elif req_format == 'worst': 420 | video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality 421 | elif req_format in ('-1', 'all'): 422 | video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats 423 | else: 424 | # Specific formats. We pick the first in a slash-delimeted sequence. 
425 | # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. 426 | req_formats = req_format.split('/') 427 | video_url_list = None 428 | for rf in req_formats: 429 | if rf in url_map: 430 | video_url_list = [(rf, url_map[rf])] 431 | break 432 | if video_url_list is None: 433 | self._downloader.trouble(u'ERROR: requested format not available') 434 | return 435 | else: 436 | self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') 437 | return 438 | 439 | results = [] 440 | for format_param, video_real_url in video_url_list: 441 | # Extension 442 | video_extension = self._video_extensions.get(format_param, 'flv') 443 | 444 | results.append({ 445 | 'id': video_id.decode('utf-8'), 446 | 'url': video_real_url.decode('utf-8'), 447 | 'uploader': video_uploader.decode('utf-8'), 448 | 'upload_date': upload_date, 449 | 'title': video_title, 450 | 'ext': video_extension.decode('utf-8'), 451 | 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 452 | 'thumbnail': video_thumbnail.decode('utf-8'), 453 | 'description': video_description, 454 | 'player_url': player_url, 455 | 'subtitles': video_subtitles 456 | }) 457 | return results 458 | 459 | 460 | class MetacafeIE(InfoExtractor): 461 | """Information Extractor for metacafe.com.""" 462 | 463 | _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' 464 | _DISCLAIMER = 'http://www.metacafe.com/family_filter/' 465 | _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' 466 | IE_NAME = u'metacafe' 467 | 468 | def __init__(self, downloader=None): 469 | InfoExtractor.__init__(self, downloader) 470 | 471 | def report_disclaimer(self): 472 | """Report disclaimer retrieval.""" 473 | self._downloader.to_screen(u'[metacafe] Retrieving disclaimer') 474 | 475 | def report_age_confirmation(self): 476 | """Report attempt to confirm age.""" 477 | self._downloader.to_screen(u'[metacafe] Confirming age') 478 | 479 | def report_download_webpage(self, video_id): 480 | """Report webpage download.""" 481 | self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id) 482 | 483 | def report_extraction(self, video_id): 484 | """Report information extraction.""" 485 | self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id) 486 | 487 | def _real_initialize(self): 488 | # Retrieve disclaimer 489 | request = urllib2.Request(self._DISCLAIMER) 490 | try: 491 | self.report_disclaimer() 492 | disclaimer = urllib2.urlopen(request).read() 493 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 494 | self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err)) 495 | return 496 | 497 | # Confirm age 498 | disclaimer_form = { 499 | 'filters': '0', 500 | 'submit': "Continue - I'm over 18", 501 | } 502 | request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form)) 503 | try: 504 | self.report_age_confirmation() 505 | disclaimer = urllib2.urlopen(request).read() 506 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 507 | self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err)) 508 | return 509 | 510 | def _real_extract(self, url): 511 | # Extract id and simplified title from URL 512 | mobj = re.match(self._VALID_URL, url) 513 | if mobj is None: 514 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 515 | return 516 | 517 | video_id = mobj.group(1) 518 | 519 | # Check if video comes from 
YouTube 520 | mobj2 = re.match(r'^yt-(.*)$', video_id) 521 | if mobj2 is not None: 522 | self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) 523 | return 524 | 525 | # Retrieve video webpage to extract further information 526 | request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id) 527 | try: 528 | self.report_download_webpage(video_id) 529 | webpage = urllib2.urlopen(request).read() 530 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 531 | self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err)) 532 | return 533 | 534 | # Extract URL, uploader and title from webpage 535 | self.report_extraction(video_id) 536 | mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) 537 | if mobj is not None: 538 | mediaURL = urllib.unquote(mobj.group(1)) 539 | video_extension = mediaURL[-3:] 540 | 541 | # Extract gdaKey if available 542 | mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) 543 | if mobj is None: 544 | video_url = mediaURL 545 | else: 546 | gdaKey = mobj.group(1) 547 | video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) 548 | else: 549 | mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) 550 | if mobj is None: 551 | self._downloader.trouble(u'ERROR: unable to extract media URL') 552 | return 553 | vardict = parse_qs(mobj.group(1)) 554 | if 'mediaData' not in vardict: 555 | self._downloader.trouble(u'ERROR: unable to extract media URL') 556 | return 557 | mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0]) 558 | if mobj is None: 559 | self._downloader.trouble(u'ERROR: unable to extract media URL') 560 | return 561 | mediaURL = mobj.group(1).replace('\\/', '/') 562 | video_extension = mediaURL[-3:] 563 | video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2)) 564 | 565 | mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage) 566 | if mobj is None: 567 | self._downloader.trouble(u'ERROR: unable to extract title') 568 | return 569 | video_title = mobj.group(1).decode('utf-8') 570 | 571 | mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage) 572 | if mobj is None: 573 | self._downloader.trouble(u'ERROR: unable to extract uploader nickname') 574 | return 575 | video_uploader = mobj.group(1) 576 | 577 | return [{ 578 | 'id': video_id.decode('utf-8'), 579 | 'url': video_url.decode('utf-8'), 580 | 'uploader': video_uploader.decode('utf-8'), 581 | 'upload_date': u'NA', 582 | 'title': video_title, 583 | 'ext': video_extension.decode('utf-8'), 584 | 'format': u'NA', 585 | 'player_url': None, 586 | }] 587 | 588 | 589 | class DailymotionIE(InfoExtractor): 590 | """Information Extractor for Dailymotion""" 591 | 592 | _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' 593 | IE_NAME = u'dailymotion' 594 | 595 | def __init__(self, downloader=None): 596 | InfoExtractor.__init__(self, downloader) 597 | 598 | def report_download_webpage(self, video_id): 599 | """Report webpage download.""" 600 | self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id) 601 | 602 | def report_extraction(self, video_id): 603 | """Report information extraction.""" 604 | self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id) 605 | 606 | def _real_extract(self, url): 607 | # Extract id and simplified title from URL 608 | mobj = re.match(self._VALID_URL, url) 609 | if mobj is None: 610 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 611 | return 612 | 613 | video_id = mobj.group(1) 614 | 615 | video_extension = 'flv'
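# For reference, a URL matching _VALID_URL above looks like (hypothetical
# id and slug):
#   http://www.dailymotion.com/video/x7u5kq_some-title_news
# where group(1), the id, is the path segment before the first underscore.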
616 | 617 | # Retrieve video webpage to extract further information 618 | request = urllib2.Request(url) 619 | request.add_header('Cookie', 'family_filter=off') 620 | try: 621 | self.report_download_webpage(video_id) 622 | webpage = urllib2.urlopen(request).read() 623 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 624 | self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err)) 625 | return 626 | 627 | # Extract URL, uploader and title from webpage 628 | self.report_extraction(video_id) 629 | mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage) 630 | if mobj is None: 631 | self._downloader.trouble(u'ERROR: unable to extract media URL') 632 | return 633 | sequence = urllib.unquote(mobj.group(1)) 634 | mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence) 635 | if mobj is None: 636 | self._downloader.trouble(u'ERROR: unable to extract media URL') 637 | return 638 | mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '') 639 | 640 | # if needed add http://www.dailymotion.com/ if relative URL 641 | 642 | video_url = mediaURL 643 | 644 | mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage) 645 | if mobj is None: 646 | self._downloader.trouble(u'ERROR: unable to extract title') 647 | return 648 | video_title = unescapeHTML(mobj.group('title').decode('utf-8')) 649 | 650 | mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage) 651 | if mobj is None: 652 | self._downloader.trouble(u'ERROR: unable to extract uploader nickname') 653 | return 654 | video_uploader = mobj.group(1) 655 | 656 | return [{ 657 | 'id': video_id.decode('utf-8'), 658 | 'url': video_url.decode('utf-8'), 659 | 'uploader': video_uploader.decode('utf-8'), 660 | 'upload_date': u'NA', 661 | 'title': video_title, 662 | 'ext': video_extension.decode('utf-8'), 663 | 'format': u'NA', 664 | 'player_url': None, 665 | }] 666 | 667 | 668 | class GoogleIE(InfoExtractor): 669 | """Information extractor for video.google.com.""" 670 | 671 | _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*' 672 | IE_NAME = u'video.google' 673 | 674 | def __init__(self, downloader=None): 675 | InfoExtractor.__init__(self, downloader) 676 | 677 | def report_download_webpage(self, video_id): 678 | """Report webpage download.""" 679 | self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id) 680 | 681 | def report_extraction(self, video_id): 682 | """Report information extraction.""" 683 | self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id) 684 | 685 | def _real_extract(self, url): 686 | # Extract id from URL 687 | mobj = re.match(self._VALID_URL, url) 688 | if mobj is None: 689 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 690 | return 691 | 692 | video_id = mobj.group(1) 693 | 694 | video_extension = 'mp4' 695 | 696 | # Retrieve video webpage to extract further information 697 | request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id) 698 | try: 699 | self.report_download_webpage(video_id) 700 | webpage = urllib2.urlopen(request).read() 701 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 702 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 703 | return 704 | 705 | # Extract URL, uploader, and title from webpage 706 | self.report_extraction(video_id) 707 | mobj = re.search(r"download_url:'([^']+)'", webpage) 708 | if mobj is None: 709 | video_extension = 'flv'
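# Fallback for pages that embed the URL JavaScript-escaped; an illustrative
# fragment (not a captured page) looks like
#   videoUrl\x3dhttp%3A%2F%2Fvideo.google.com%2F...\x26
# so the \x3d and \x26 escapes are decoded back to '=' and '&' below.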
710 | mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage) 711 | if mobj is None: 712 | self._downloader.trouble(u'ERROR: unable to extract media URL') 713 | return 714 | mediaURL = urllib.unquote(mobj.group(1)) 715 | mediaURL = mediaURL.replace('\\x3d', '\x3d') 716 | mediaURL = mediaURL.replace('\\x26', '\x26') 717 | 718 | video_url = mediaURL 719 | 720 | mobj = re.search(r'<title>(.*)</title>', webpage) 721 | if mobj is None: 722 | self._downloader.trouble(u'ERROR: unable to extract title') 723 | return 724 | video_title = mobj.group(1).decode('utf-8') 725 | 726 | # Extract video description 727 | mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage) 728 | if mobj is None: 729 | self._downloader.trouble(u'ERROR: unable to extract video description') 730 | return 731 | video_description = mobj.group(1).decode('utf-8') 732 | if not video_description: 733 | video_description = 'No description available.' 734 | 735 | # Extract video thumbnail 736 | if self._downloader.params.get('forcethumbnail', False): 737 | request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id))) 738 | try: 739 | webpage = urllib2.urlopen(request).read() 740 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 741 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 742 | return 743 | mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage) 744 | if mobj is None: 745 | self._downloader.trouble(u'ERROR: unable to extract video thumbnail') 746 | return 747 | video_thumbnail = mobj.group(1) 748 | else: # we need something to pass to process_info 749 | video_thumbnail = '' 750 | 751 | return [{ 752 | 'id': video_id.decode('utf-8'), 753 | 'url': video_url.decode('utf-8'), 754 | 'uploader': u'NA', 755 | 'upload_date': u'NA', 756 | 'title': video_title, 757 | 'ext': video_extension.decode('utf-8'), 758 | 'format': u'NA', 759 | 'player_url': None, 760 | }] 761 | 762 | 763 | class PhotobucketIE(InfoExtractor): 764 | """Information extractor for photobucket.com.""" 765 | 766 | _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' 767 | IE_NAME = u'photobucket' 768 | 769 | def __init__(self, downloader=None): 770 | InfoExtractor.__init__(self, downloader) 771 | 772 | def report_download_webpage(self, video_id): 773 | """Report webpage download.""" 774 | self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id) 775 | 776 | def report_extraction(self, video_id): 777 | """Report information extraction.""" 778 | self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id) 779 | 780 | def _real_extract(self, url): 781 | # Extract id from URL 782 | mobj = re.match(self._VALID_URL, url) 783 | if mobj is None: 784 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 785 | return 786 | 787 | video_id = mobj.group(1) 788 | 789 | video_extension = 'flv' 790 | 791 | # Retrieve video webpage to extract further information 792 | request = urllib2.Request(url) 793 | try: 794 | self.report_download_webpage(video_id) 795 | webpage = urllib2.urlopen(request).read() 796 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 797 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 798 | return 799 | 800 | # Extract URL, uploader, and title from webpage 801 | self.report_extraction(video_id) 802 | mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage) 803 | if mobj is None: 804 | self._downloader.trouble(u'ERROR: unable to extract media URL') 805 | return 806 | mediaURL = 
urllib.unquote(mobj.group(1)) 807 | 808 | video_url = mediaURL 809 | 810 | mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) 811 | if mobj is None: 812 | self._downloader.trouble(u'ERROR: unable to extract title') 813 | return 814 | video_title = mobj.group(1).decode('utf-8') 815 | 816 | video_uploader = mobj.group(2).decode('utf-8') 817 | 818 | return [{ 819 | 'id': video_id.decode('utf-8'), 820 | 'url': video_url.decode('utf-8'), 821 | 'uploader': video_uploader, 822 | 'upload_date': u'NA', 823 | 'title': video_title, 824 | 'ext': video_extension.decode('utf-8'), 825 | 'format': u'NA', 826 | 'player_url': None, 827 | }] 828 | 829 | 830 | class YahooIE(InfoExtractor): 831 | """Information extractor for video.yahoo.com.""" 832 | 833 | # _VALID_URL matches all Yahoo! Video URLs 834 | # _VPAGE_URL matches only the extractable '/watch/' URLs 835 | _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' 836 | _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' 837 | IE_NAME = u'video.yahoo' 838 | 839 | def __init__(self, downloader=None): 840 | InfoExtractor.__init__(self, downloader) 841 | 842 | def report_download_webpage(self, video_id): 843 | """Report webpage download.""" 844 | self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id) 845 | 846 | def report_extraction(self, video_id): 847 | """Report information extraction.""" 848 | self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id) 849 | 850 | def _real_extract(self, url, new_video=True): 851 | # Extract ID from URL 852 | mobj = re.match(self._VALID_URL, url) 853 | if mobj is None: 854 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 855 | return 856 | 857 | video_id = mobj.group(2) 858 | video_extension = 'flv' 859 | 860 | # Rewrite valid but non-extractable URLs as 861 | # extractable English language /watch/ URLs 862 | if re.match(self._VPAGE_URL, url) is None: 863 | request = urllib2.Request(url) 864 | try: 865 | webpage = urllib2.urlopen(request).read() 866 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 867 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 868 | return 869 | 870 | mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) 871 | if mobj is None: 872 | self._downloader.trouble(u'ERROR: Unable to extract id field') 873 | return 874 | yahoo_id = mobj.group(1) 875 | 876 | mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage) 877 | if mobj is None: 878 | self._downloader.trouble(u'ERROR: Unable to extract vid field') 879 | return 880 | yahoo_vid = mobj.group(1) 881 | 882 | url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id) 883 | return self._real_extract(url, new_video=False) 884 | 885 | # Retrieve video webpage to extract further information 886 | request = urllib2.Request(url) 887 | try: 888 | self.report_download_webpage(video_id) 889 | webpage = urllib2.urlopen(request).read() 890 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 891 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 892 | return 893 | 894 | # Extract uploader and title from webpage 895 | self.report_extraction(video_id) 896 | mobj = re.search(r'', webpage) 897 | if mobj is None: 898 | self._downloader.trouble(u'ERROR: unable to extract video title') 899 | return 900 | video_title = mobj.group(1).decode('utf-8') 901 | 902 | mobj = re.search(r'
<h2 class="ti-5"><a href="http://video\.yahoo\.com/(?:people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>
', webpage) 903 | if mobj is None: 904 | self._downloader.trouble(u'ERROR: unable to extract video uploader') 905 | return 906 | video_uploader = mobj.group(1).decode('utf-8') 907 | 908 | # Extract video thumbnail 909 | mobj = re.search(r'', webpage) 910 | if mobj is None: 911 | self._downloader.trouble(u'ERROR: unable to extract video thumbnail') 912 | return 913 | video_thumbnail = mobj.group(1).decode('utf-8') 914 | 915 | # Extract video description 916 | mobj = re.search(r'', webpage) 917 | if mobj is None: 918 | self._downloader.trouble(u'ERROR: unable to extract video description') 919 | return 920 | video_description = mobj.group(1).decode('utf-8') 921 | if not video_description: 922 | video_description = 'No description available.' 923 | 924 | # Extract video height and width 925 | mobj = re.search(r'', webpage) 926 | if mobj is None: 927 | self._downloader.trouble(u'ERROR: unable to extract video height') 928 | return 929 | yv_video_height = mobj.group(1) 930 | 931 | mobj = re.search(r'', webpage) 932 | if mobj is None: 933 | self._downloader.trouble(u'ERROR: unable to extract video width') 934 | return 935 | yv_video_width = mobj.group(1) 936 | 937 | # Retrieve video playlist to extract media URL 938 | # I'm not completely sure what all these options are, but we 939 | # seem to need most of them, otherwise the server sends a 401. 940 | yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents 941 | yv_bitrate = '700' # according to Wikipedia this is hard-coded 942 | request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id + 943 | '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height + 944 | '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') 945 | try: 946 | self.report_download_webpage(video_id) 947 | webpage = urllib2.urlopen(request).read() 948 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 949 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 950 | return 951 | 952 | # Extract media URL from playlist XML 953 | mobj = re.search(r'[^:]*: (.*?)( \([^\(]*\))?', webpage) 1040 | if mobj is not None: 1041 | video_upload_date = mobj.group(1) 1042 | 1043 | # Vimeo specific: extract request signature and timestamp 1044 | sig = config['request']['signature'] 1045 | timestamp = config['request']['timestamp'] 1046 | 1047 | # Vimeo specific: extract video codec and quality information 1048 | # TODO bind to format param 1049 | codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] 1050 | for codec in codecs: 1051 | if codec[0] in config["video"]["files"]: 1052 | video_codec = codec[0] 1053 | video_extension = codec[1] 1054 | if 'hd' in config["video"]["files"][codec[0]]: quality = 'hd' 1055 | else: quality = 'sd' 1056 | break 1057 | else: 1058 | self._downloader.trouble(u'ERROR: no known codec found') 1059 | return 1060 | 1061 | video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ 1062 | %(video_id, sig, timestamp, quality, video_codec.upper()) 1063 | 1064 | return [{ 1065 | 'id': video_id, 1066 | 'url': video_url, 1067 | 'uploader': video_uploader, 1068 | 'upload_date': video_upload_date, 1069 | 'title': video_title, 1070 | 'ext': video_extension, 1071 | 'thumbnail': video_thumbnail, 1072 | 'description': video_description, 1073 | 'player_url': None, 1074 | }] 1075 | 1076 | 1077 | class 
GenericIE(InfoExtractor): 1078 | """Generic last-resort information extractor.""" 1079 | 1080 | _VALID_URL = r'.*' 1081 | IE_NAME = u'generic' 1082 | 1083 | def __init__(self, downloader=None): 1084 | InfoExtractor.__init__(self, downloader) 1085 | 1086 | def report_download_webpage(self, video_id): 1087 | """Report webpage download.""" 1088 | self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') 1089 | self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id) 1090 | 1091 | def report_extraction(self, video_id): 1092 | """Report information extraction.""" 1093 | self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id) 1094 | 1095 | def report_following_redirect(self, new_url): 1096 | """Report information extraction.""" 1097 | self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) 1098 | 1099 | def _test_redirect(self, url): 1100 | """Check if it is a redirect, like url shorteners, in case restart chain.""" 1101 | class HeadRequest(urllib2.Request): 1102 | def get_method(self): 1103 | return "HEAD" 1104 | 1105 | class HEADRedirectHandler(urllib2.HTTPRedirectHandler): 1106 | """ 1107 | Subclass the HTTPRedirectHandler to make it use our 1108 | HeadRequest also on the redirected URL 1109 | """ 1110 | def redirect_request(self, req, fp, code, msg, headers, newurl): 1111 | if code in (301, 302, 303, 307): 1112 | newurl = newurl.replace(' ', '%20') 1113 | newheaders = dict((k,v) for k,v in req.headers.items() 1114 | if k.lower() not in ("content-length", "content-type")) 1115 | return HeadRequest(newurl, 1116 | headers=newheaders, 1117 | origin_req_host=req.get_origin_req_host(), 1118 | unverifiable=True) 1119 | else: 1120 | raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) 1121 | 1122 | class HTTPMethodFallback(urllib2.BaseHandler): 1123 | """ 1124 | Fallback to GET if HEAD is not allowed (405 HTTP error) 1125 | """ 1126 | def http_error_405(self, req, fp, code, msg, headers): 1127 | fp.read() 1128 | fp.close() 1129 | 1130 | newheaders = dict((k,v) for k,v in req.headers.items() 1131 | if k.lower() not in ("content-length", "content-type")) 1132 | return self.parent.open(urllib2.Request(req.get_full_url(), 1133 | headers=newheaders, 1134 | origin_req_host=req.get_origin_req_host(), 1135 | unverifiable=True)) 1136 | 1137 | # Build our opener 1138 | opener = urllib2.OpenerDirector() 1139 | for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler, 1140 | HTTPMethodFallback, HEADRedirectHandler, 1141 | urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]: 1142 | opener.add_handler(handler()) 1143 | 1144 | response = opener.open(HeadRequest(url)) 1145 | new_url = response.geturl() 1146 | 1147 | if url == new_url: return False 1148 | 1149 | self.report_following_redirect(new_url) 1150 | self._downloader.download([new_url]) 1151 | return True 1152 | 1153 | def _real_extract(self, url): 1154 | if self._test_redirect(url): return 1155 | 1156 | video_id = url.split('/')[-1] 1157 | request = urllib2.Request(url) 1158 | try: 1159 | self.report_download_webpage(video_id) 1160 | webpage = urllib2.urlopen(request).read() 1161 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1162 | self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err)) 1163 | return 1164 | except ValueError, err: 1165 | # since this is the last-resort InfoExtractor, if 1166 | # this error is thrown, it'll be thrown here 1167 | self._downloader.trouble(u'ERROR: Invalid URL: 
%s' % url) 1168 | return 1169 | 1170 | self.report_extraction(video_id) 1171 | # Start with something easy: JW Player in SWFObject 1172 | mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) 1173 | if mobj is None: 1174 | # Broaden the search a little bit 1175 | mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) 1176 | if mobj is None: 1177 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 1178 | return 1179 | 1180 | # It's possible that one of the regexes 1181 | # matched, but returned an empty group: 1182 | if mobj.group(1) is None: 1183 | self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 1184 | return 1185 | 1186 | video_url = urllib.unquote(mobj.group(1)) 1187 | video_id = os.path.basename(video_url) 1188 | 1189 | # here's a fun little line of code for you: 1190 | video_extension = os.path.splitext(video_id)[1][1:] 1191 | video_id = os.path.splitext(video_id)[0] 1192 | 1193 | # it's tempting to parse this further, but you would 1194 | # have to take into account all the variations like 1195 | # Video Title - Site Name 1196 | # Site Name | Video Title 1197 | # Video Title - Tagline | Site Name 1198 | # and so on and so forth; it's just not practical 1199 | mobj = re.search(r'(.*)', webpage) 1200 | if mobj is None: 1201 | self._downloader.trouble(u'ERROR: unable to extract title') 1202 | return 1203 | video_title = mobj.group(1).decode('utf-8') 1204 | 1205 | # video uploader is domain name 1206 | mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) 1207 | if mobj is None: 1208 | self._downloader.trouble(u'ERROR: unable to extract title') 1209 | return 1210 | video_uploader = mobj.group(1).decode('utf-8') 1211 | 1212 | return [{ 1213 | 'id': video_id.decode('utf-8'), 1214 | 'url': video_url.decode('utf-8'), 1215 | 'uploader': video_uploader, 1216 | 'upload_date': u'NA', 1217 | 'title': video_title, 1218 | 'ext': video_extension.decode('utf-8'), 1219 | 'format': u'NA', 1220 | 'player_url': None, 1221 | }] 1222 | 1223 | 1224 | class YoutubeSearchIE(InfoExtractor): 1225 | """Information Extractor for YouTube search queries.""" 1226 | _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' 1227 | _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' 1228 | _max_youtube_results = 1000 1229 | IE_NAME = u'youtube:search' 1230 | 1231 | def __init__(self, downloader=None): 1232 | InfoExtractor.__init__(self, downloader) 1233 | 1234 | def report_download_page(self, query, pagenum): 1235 | """Report attempt to download search page with given number.""" 1236 | query = query.decode(preferredencoding()) 1237 | self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) 1238 | 1239 | def _real_extract(self, query): 1240 | mobj = re.match(self._VALID_URL, query) 1241 | if mobj is None: 1242 | self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 1243 | return 1244 | 1245 | prefix, query = query.split(':') 1246 | prefix = prefix[8:] 1247 | query = query.encode('utf-8') 1248 | if prefix == '': 1249 | self._download_n_results(query, 1) 1250 | return 1251 | elif prefix == 'all': 1252 | self._download_n_results(query, self._max_youtube_results) 1253 | return 1254 | else: 1255 | try: 1256 | n = long(prefix) 1257 | if n <= 0: 1258 | self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 1259 | return 1260 | elif n > self._max_youtube_results: 1261 | self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested 
%i)' % (self._max_youtube_results, n))
1262 | 					n = self._max_youtube_results
1263 | 				self._download_n_results(query, n)
1264 | 				return
1265 | 			except ValueError: # parsing prefix as integer fails
1266 | 				self._download_n_results(query, 1)
1267 | 				return
1268 | 
1269 | 	def _download_n_results(self, query, n):
1270 | 		"""Downloads a specified number of results for a query"""
1271 | 
1272 | 		video_ids = []
1273 | 		pagenum = 0
1274 | 		limit = n
1275 | 
1276 | 		while (50 * pagenum) < limit:
1277 | 			self.report_download_page(query, pagenum+1)
1278 | 			result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1)
1279 | 			request = urllib2.Request(result_url)
1280 | 			try:
1281 | 				data = urllib2.urlopen(request).read()
1282 | 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283 | 				self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
1284 | 				return
1285 | 			api_response = json.loads(data)['data']
1286 | 
1287 | 			new_ids = list(video['id'] for video in api_response['items'])
1288 | 			video_ids += new_ids
1289 | 
1290 | 			limit = min(n, api_response['totalItems'])
1291 | 			pagenum += 1
1292 | 
1293 | 		if len(video_ids) > n:
1294 | 			video_ids = video_ids[:n]
1295 | 		for id in video_ids:
1296 | 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1297 | 		return
1298 | 
1299 | 
1300 | class GoogleSearchIE(InfoExtractor):
1301 | 	"""Information Extractor for Google Video search queries."""
1302 | 	_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1303 | 	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1304 | 	_VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1305 | 	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
1306 | 	_max_google_results = 1000
1307 | 	IE_NAME = u'video.google:search'
1308 | 
1309 | 	def __init__(self, downloader=None):
1310 | 		InfoExtractor.__init__(self, downloader)
1311 | 
1312 | 	def report_download_page(self, query, pagenum):
1313 | 		"""Report attempt to download playlist page with given number."""
1314 | 		query = query.decode(preferredencoding())
1315 | 		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1316 | 
1317 | 	def _real_extract(self, query):
1318 | 		mobj = re.match(self._VALID_URL, query)
1319 | 		if mobj is None:
1320 | 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1321 | 			return
1322 | 
1323 | 		prefix, query = query.split(':')
1324 | 		prefix = prefix[8:]
1325 | 		query = query.encode('utf-8')
1326 | 		if prefix == '':
1327 | 			self._download_n_results(query, 1)
1328 | 			return
1329 | 		elif prefix == 'all':
1330 | 			self._download_n_results(query, self._max_google_results)
1331 | 			return
1332 | 		else:
1333 | 			try:
1334 | 				n = long(prefix)
1335 | 				if n <= 0:
1336 | 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1337 | 					return
1338 | 				elif n > self._max_google_results:
1339 | 					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1340 | 					n = self._max_google_results
1341 | 				self._download_n_results(query, n)
1342 | 				return
1343 | 			except ValueError: # parsing prefix as integer fails
1344 | 				self._download_n_results(query, 1)
1345 | 				return
1346 | 
1347 | 	def _download_n_results(self, query, n):
1348 | 		"""Downloads a specified number of results for a query"""
1349 | 
1350 | 		video_ids = []
1351 | 		pagenum = 0
1352 | 
1353 | 		while True:
1354 | 			self.report_download_page(query, pagenum)
1355 | 			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10)
1356 | 			request = urllib2.Request(result_url)
1357 | 			try:
1358 | 				page = urllib2.urlopen(request).read()
1359 | 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1360 | 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1361 | 				return
1362 | 
1363 | 			# Extract video identifiers
1364 | 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1365 | 				video_id = mobj.group(1)
1366 | 				if video_id not in video_ids:
1367 | 					video_ids.append(video_id)
1368 | 					if len(video_ids) == n:
1369 | 						# Specified n videos reached
1370 | 						for id in video_ids:
1371 | 							self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1372 | 						return
1373 | 
1374 | 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1375 | 				for id in video_ids:
1376 | 					self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1377 | 				return
1378 | 
1379 | 			pagenum = pagenum + 1
1380 | 
1381 | 
1382 | class YahooSearchIE(InfoExtractor):
1383 | 	"""Information Extractor for Yahoo! 
Video search queries.""" 1384 | _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+' 1385 | _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s' 1386 | _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"' 1387 | _MORE_PAGES_INDICATOR = r'\s*Next' 1388 | _max_yahoo_results = 1000 1389 | IE_NAME = u'video.yahoo:search' 1390 | 1391 | def __init__(self, downloader=None): 1392 | InfoExtractor.__init__(self, downloader) 1393 | 1394 | def report_download_page(self, query, pagenum): 1395 | """Report attempt to download playlist page with given number.""" 1396 | query = query.decode(preferredencoding()) 1397 | self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum)) 1398 | 1399 | def _real_extract(self, query): 1400 | mobj = re.match(self._VALID_URL, query) 1401 | if mobj is None: 1402 | self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 1403 | return 1404 | 1405 | prefix, query = query.split(':') 1406 | prefix = prefix[8:] 1407 | query = query.encode('utf-8') 1408 | if prefix == '': 1409 | self._download_n_results(query, 1) 1410 | return 1411 | elif prefix == 'all': 1412 | self._download_n_results(query, self._max_yahoo_results) 1413 | return 1414 | else: 1415 | try: 1416 | n = long(prefix) 1417 | if n <= 0: 1418 | self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 1419 | return 1420 | elif n > self._max_yahoo_results: 1421 | self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) 1422 | n = self._max_yahoo_results 1423 | self._download_n_results(query, n) 1424 | return 1425 | except ValueError: # parsing prefix as integer fails 1426 | self._download_n_results(query, 1) 1427 | return 1428 | 1429 | def _download_n_results(self, query, n): 1430 | """Downloads a specified number of results for a query""" 1431 | 1432 | video_ids = [] 1433 | already_seen = set() 1434 | pagenum = 1 1435 | 1436 | while True: 1437 | self.report_download_page(query, pagenum) 1438 | result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) 1439 | request = urllib2.Request(result_url) 1440 | try: 1441 | page = urllib2.urlopen(request).read() 1442 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1443 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 1444 | return 1445 | 1446 | # Extract video identifiers 1447 | for mobj in re.finditer(self._VIDEO_INDICATOR, page): 1448 | video_id = mobj.group(1) 1449 | if video_id not in already_seen: 1450 | video_ids.append(video_id) 1451 | already_seen.add(video_id) 1452 | if len(video_ids) == n: 1453 | # Specified n videos reached 1454 | for id in video_ids: 1455 | self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) 1456 | return 1457 | 1458 | if re.search(self._MORE_PAGES_INDICATOR, page) is None: 1459 | for id in video_ids: 1460 | self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) 1461 | return 1462 | 1463 | pagenum = pagenum + 1 1464 | 1465 | 1466 | class YoutubePlaylistIE(InfoExtractor): 1467 | """Information Extractor for YouTube playlists.""" 1468 | 1469 | _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' 1470 | _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' 1471 | _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&' 
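	# Illustrative example (ids are made up): for an ordinary playlist the page
	# URL built from _TEMPLATE_URL with (access, prefix, id, page) is
	#   http://www.youtube.com/view_play_list?p=PL1234567890A&page=1&gl=US&hl=en
	# and _VIDEO_INDICATOR_TEMPLATE % playlist_id then matches hrefs such as
	#   /watch?v=abc123XYZ45&list=PL1234567890A&index=1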
1472 | _MORE_PAGES_INDICATOR = r'yt-uix-pager-next' 1473 | IE_NAME = u'youtube:playlist' 1474 | 1475 | def __init__(self, downloader=None): 1476 | InfoExtractor.__init__(self, downloader) 1477 | 1478 | def report_download_page(self, playlist_id, pagenum): 1479 | """Report attempt to download playlist page with given number.""" 1480 | self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) 1481 | 1482 | def _real_extract(self, url): 1483 | # Extract playlist id 1484 | mobj = re.match(self._VALID_URL, url) 1485 | if mobj is None: 1486 | self._downloader.trouble(u'ERROR: invalid url: %s' % url) 1487 | return 1488 | 1489 | # Single video case 1490 | if mobj.group(3) is not None: 1491 | self._downloader.download([mobj.group(3)]) 1492 | return 1493 | 1494 | # Download playlist pages 1495 | # prefix is 'p' as default for playlists but there are other types that need extra care 1496 | playlist_prefix = mobj.group(1) 1497 | if playlist_prefix == 'a': 1498 | playlist_access = 'artist' 1499 | else: 1500 | playlist_prefix = 'p' 1501 | playlist_access = 'view_play_list' 1502 | playlist_id = mobj.group(2) 1503 | video_ids = [] 1504 | pagenum = 1 1505 | 1506 | while True: 1507 | self.report_download_page(playlist_id, pagenum) 1508 | url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum) 1509 | request = urllib2.Request(url) 1510 | try: 1511 | page = urllib2.urlopen(request).read() 1512 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1513 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 1514 | return 1515 | 1516 | # Extract video identifiers 1517 | ids_in_page = [] 1518 | for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): 1519 | if mobj.group(1) not in ids_in_page: 1520 | ids_in_page.append(mobj.group(1)) 1521 | video_ids.extend(ids_in_page) 1522 | 1523 | if re.search(self._MORE_PAGES_INDICATOR, page) is None: 1524 | break 1525 | pagenum = pagenum + 1 1526 | 1527 | playliststart = self._downloader.params.get('playliststart', 1) - 1 1528 | playlistend = self._downloader.params.get('playlistend', -1) 1529 | if playlistend == -1: 1530 | video_ids = video_ids[playliststart:] 1531 | else: 1532 | video_ids = video_ids[playliststart:playlistend] 1533 | 1534 | for id in video_ids: 1535 | self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) 1536 | return 1537 | 1538 | 1539 | class YoutubeUserIE(InfoExtractor): 1540 | """Information Extractor for YouTube users.""" 1541 | 1542 | _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' 1543 | _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' 1544 | _GDATA_PAGE_SIZE = 50 1545 | _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' 1546 | _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' 1547 | IE_NAME = u'youtube:user' 1548 | 1549 | def __init__(self, downloader=None): 1550 | InfoExtractor.__init__(self, downloader) 1551 | 1552 | def report_download_page(self, username, start_index): 1553 | """Report attempt to download user page.""" 1554 | self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % 1555 | (username, start_index, start_index + self._GDATA_PAGE_SIZE)) 1556 | 1557 | def _real_extract(self, url): 1558 | # Extract username 1559 | mobj = re.match(self._VALID_URL, url) 1560 | if mobj is None: 1561 | self._downloader.trouble(u'ERROR: invalid url: %s' % url) 1562 | return 1563 | 1564 | 
username = mobj.group(1) 1565 | 1566 | # Download video ids using YouTube Data API. Result size per 1567 | # query is limited (currently to 50 videos) so we need to query 1568 | # page by page until there are no video ids - it means we got 1569 | # all of them. 1570 | 1571 | video_ids = [] 1572 | pagenum = 0 1573 | 1574 | while True: 1575 | start_index = pagenum * self._GDATA_PAGE_SIZE + 1 1576 | self.report_download_page(username, start_index) 1577 | 1578 | request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)) 1579 | 1580 | try: 1581 | page = urllib2.urlopen(request).read() 1582 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1583 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 1584 | return 1585 | 1586 | # Extract video identifiers 1587 | ids_in_page = [] 1588 | 1589 | for mobj in re.finditer(self._VIDEO_INDICATOR, page): 1590 | if mobj.group(1) not in ids_in_page: 1591 | ids_in_page.append(mobj.group(1)) 1592 | 1593 | video_ids.extend(ids_in_page) 1594 | 1595 | # A little optimization - if current page is not 1596 | # "full", ie. does not contain PAGE_SIZE video ids then 1597 | # we can assume that this page is the last one - there 1598 | # are no more ids on further pages - no need to query 1599 | # again. 1600 | 1601 | if len(ids_in_page) < self._GDATA_PAGE_SIZE: 1602 | break 1603 | 1604 | pagenum += 1 1605 | 1606 | all_ids_count = len(video_ids) 1607 | playliststart = self._downloader.params.get('playliststart', 1) - 1 1608 | playlistend = self._downloader.params.get('playlistend', -1) 1609 | 1610 | if playlistend == -1: 1611 | video_ids = video_ids[playliststart:] 1612 | else: 1613 | video_ids = video_ids[playliststart:playlistend] 1614 | 1615 | self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" % 1616 | (username, all_ids_count, len(video_ids))) 1617 | 1618 | for video_id in video_ids: 1619 | self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id]) 1620 | 1621 | 1622 | class DepositFilesIE(InfoExtractor): 1623 | """Information extractor for depositfiles.com""" 1624 | 1625 | _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)' 1626 | IE_NAME = u'DepositFiles' 1627 | 1628 | def __init__(self, downloader=None): 1629 | InfoExtractor.__init__(self, downloader) 1630 | 1631 | def report_download_webpage(self, file_id): 1632 | """Report webpage download.""" 1633 | self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id) 1634 | 1635 | def report_extraction(self, file_id): 1636 | """Report information extraction.""" 1637 | self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id) 1638 | 1639 | def _real_extract(self, url): 1640 | file_id = url.split('/')[-1] 1641 | # Rebuild url in english locale 1642 | url = 'http://depositfiles.com/en/files/' + file_id 1643 | 1644 | # Retrieve file webpage with 'Free download' button pressed 1645 | free_download_indication = { 'gateway_result' : '1' } 1646 | request = urllib2.Request(url, urllib.urlencode(free_download_indication)) 1647 | try: 1648 | self.report_download_webpage(file_id) 1649 | webpage = urllib2.urlopen(request).read() 1650 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1651 | self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err)) 1652 | return 1653 | 1654 | # Search for the real file URL 1655 | mobj = re.search(r'
<form action="(http://fileshare.+?)"', webpage)
1656 | 		if (mobj is None) or (mobj.group(1) is None):
1657 | 			# Try to figure out reason of the error.
1658 | 			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1659 | 			if (mobj is not None) and (mobj.group(1) is not None):
1660 | 				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1661 | 				self._downloader.trouble(u'ERROR: %s' % restriction_message)
1662 | 			else:
1663 | 				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1664 | 			return
1665 | 
1666 | 		file_url = mobj.group(1)
1667 | 		file_extension = os.path.splitext(file_url)[1][1:]
1668 | 
1669 | 		# Search for file title
1670 | 		mobj = re.search(r'<b title="(.*?)">', webpage)
1671 | 		if mobj is None:
1672 | 			self._downloader.trouble(u'ERROR: unable to extract title')
1673 | 			return
1674 | 		file_title = mobj.group(1).decode('utf-8')
1675 | 
1676 | 		return [{
1677 | 			'id': file_id.decode('utf-8'),
1678 | 			'url': file_url.decode('utf-8'),
1679 | 			'uploader': u'NA',
1680 | 			'upload_date': u'NA',
1681 | 			'title': file_title,
1682 | 			'ext': file_extension.decode('utf-8'),
1683 | 			'format': u'NA',
1684 | 			'player_url': None,
1685 | 		}]
1686 | 
1687 | 
1688 | class FacebookIE(InfoExtractor):
1689 | 	"""Information Extractor for Facebook"""
1690 | 
1691 | 	_VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1692 | 	_LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1693 | 	_NETRC_MACHINE = 'facebook'
1694 | 	_available_formats = ['video', 'highqual', 'lowqual']
1695 | 	_video_extensions = {
1696 | 		'video': 'mp4',
1697 | 		'highqual': 'mp4',
1698 | 		'lowqual': 'mp4',
1699 | 	}
1700 | 	IE_NAME = u'facebook'
1701 | 
1702 | 	def __init__(self, downloader=None):
1703 | 		InfoExtractor.__init__(self, downloader)
1704 | 
1705 | 	def _reporter(self, message):
1706 | 		"""Add header and report message."""
1707 | 		self._downloader.to_screen(u'[facebook] %s' % message)
1708 | 
1709 | 	def report_login(self):
1710 | 		"""Report attempt to log in."""
1711 | 		self._reporter(u'Logging in')
1712 | 
1713 | 	def report_video_webpage_download(self, video_id):
1714 | 		"""Report attempt to download video webpage."""
1715 | 		self._reporter(u'%s: Downloading video webpage' % video_id)
1716 | 
1717 | 	def report_information_extraction(self, video_id):
1718 | 		"""Report attempt to extract video information."""
1719 | 		self._reporter(u'%s: Extracting video information' % video_id)
1720 | 
1721 | 	def _parse_page(self, video_webpage):
1722 | 		"""Extract video information from page"""
1723 | 		# General data
1724 | 		data = {'title': r'\("video_title", "(.*?)"\)',
1725 | 			'description': r'<div class="datawrap">(.*?)</div>',
1726 | 			'owner': r'\("video_owner_name", "(.*?)"\)',
1727 | 			'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1728 | 			}
1729 | 		video_info = {}
1730 | 		for piece in data.keys():
1731 | 			mobj = re.search(data[piece], video_webpage)
1732 | 			if mobj is not None:
1733 | 				video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1734 | 
1735 | 		# Video urls
1736 | 		video_urls = {}
1737 | 		for fmt in self._available_formats:
1738 | 			mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1739 | 			if mobj is not None:
1740 | 				# URL is in a Javascript segment inside an escaped Unicode format within
1741 | 				# the generally utf-8 page
1742 | 				video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1743 | 		video_info['video_urls'] = video_urls
1744 | 
1745 | 		return video_info
1746 | 
1747 | 	def _real_initialize(self):
1748 | 		if self._downloader is None:
1749 | 			return
1750 | 
1751 | 		useremail = None
1752 | 		password = None
1753 | 		downloader_params = self._downloader.params
1754 | 
1755 | 		# Attempt to use provided username and password or .netrc data
1756 | 		if downloader_params.get('username', None) is not None:
1757 | 			useremail = downloader_params['username']
1758 | 			password = downloader_params['password']
1759 | 		elif downloader_params.get('usenetrc', False):
1760 | 			try:
1761 | 				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1762 | 				if info is not None:
1763 | 					useremail = info[0]
1764 | 					password = info[2]
1765 | 				else:
1766 | 					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1767 | 			except (IOError, netrc.NetrcParseError), err:
1768 | 				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1769 | 				return
1770 | 
1771 | 		if useremail is None:
1772 | 			return
1773 | 
1774 | 		# Log in
1775 | 		login_form = {
1776 | 			'email': useremail,
1777 | 			'pass': password,
1778 | 			'login': 'Log+In'
1779 | 		}
1780 | 		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1781 | 		try:
1782 | 			self.report_login()
1783 | 			login_results = urllib2.urlopen(request).read()
1784 | 			if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1785 | 				self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceeded login rate limit (~3/min). 
Check credentials or wait.') 1786 | return 1787 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1788 | self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err)) 1789 | return 1790 | 1791 | def _real_extract(self, url): 1792 | mobj = re.match(self._VALID_URL, url) 1793 | if mobj is None: 1794 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 1795 | return 1796 | video_id = mobj.group('ID') 1797 | 1798 | # Get video webpage 1799 | self.report_video_webpage_download(video_id) 1800 | request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id) 1801 | try: 1802 | page = urllib2.urlopen(request) 1803 | video_webpage = page.read() 1804 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1805 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 1806 | return 1807 | 1808 | # Start extracting information 1809 | self.report_information_extraction(video_id) 1810 | 1811 | # Extract information 1812 | video_info = self._parse_page(video_webpage) 1813 | 1814 | # uploader 1815 | if 'owner' not in video_info: 1816 | self._downloader.trouble(u'ERROR: unable to extract uploader nickname') 1817 | return 1818 | video_uploader = video_info['owner'] 1819 | 1820 | # title 1821 | if 'title' not in video_info: 1822 | self._downloader.trouble(u'ERROR: unable to extract video title') 1823 | return 1824 | video_title = video_info['title'] 1825 | video_title = video_title.decode('utf-8') 1826 | 1827 | # thumbnail image 1828 | if 'thumbnail' not in video_info: 1829 | self._downloader.trouble(u'WARNING: unable to extract video thumbnail') 1830 | video_thumbnail = '' 1831 | else: 1832 | video_thumbnail = video_info['thumbnail'] 1833 | 1834 | # upload date 1835 | upload_date = u'NA' 1836 | if 'upload_date' in video_info: 1837 | upload_time = video_info['upload_date'] 1838 | timetuple = email.utils.parsedate_tz(upload_time) 1839 | if timetuple is not None: 1840 | try: 1841 | upload_date = time.strftime('%Y%m%d', timetuple[0:9]) 1842 | except: 1843 | pass 1844 | 1845 | # description 1846 | video_description = video_info.get('description', 'No description available.') 1847 | 1848 | url_map = video_info['video_urls'] 1849 | if len(url_map.keys()) > 0: 1850 | # Decide which formats to download 1851 | req_format = self._downloader.params.get('format', None) 1852 | format_limit = self._downloader.params.get('format_limit', None) 1853 | 1854 | if format_limit is not None and format_limit in self._available_formats: 1855 | format_list = self._available_formats[self._available_formats.index(format_limit):] 1856 | else: 1857 | format_list = self._available_formats 1858 | existing_formats = [x for x in format_list if x in url_map] 1859 | if len(existing_formats) == 0: 1860 | self._downloader.trouble(u'ERROR: no known formats available for video') 1861 | return 1862 | if req_format is None: 1863 | video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality 1864 | elif req_format == 'worst': 1865 | video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality 1866 | elif req_format == '-1': 1867 | video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats 1868 | else: 1869 | # Specific format 1870 | if req_format not in url_map: 1871 | self._downloader.trouble(u'ERROR: requested format not available') 1872 | return 1873 | video_url_list = [(req_format, url_map[req_format])] # Specific format 1874 | 1875 | 
results = [] 1876 | for format_param, video_real_url in video_url_list: 1877 | # Extension 1878 | video_extension = self._video_extensions.get(format_param, 'mp4') 1879 | 1880 | results.append({ 1881 | 'id': video_id.decode('utf-8'), 1882 | 'url': video_real_url.decode('utf-8'), 1883 | 'uploader': video_uploader.decode('utf-8'), 1884 | 'upload_date': upload_date, 1885 | 'title': video_title, 1886 | 'ext': video_extension.decode('utf-8'), 1887 | 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 1888 | 'thumbnail': video_thumbnail.decode('utf-8'), 1889 | 'description': video_description.decode('utf-8'), 1890 | 'player_url': None, 1891 | }) 1892 | return results 1893 | 1894 | class BlipTVIE(InfoExtractor): 1895 | """Information extractor for blip.tv""" 1896 | 1897 | _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$' 1898 | _URL_EXT = r'^.*\.([a-z0-9]+)$' 1899 | IE_NAME = u'blip.tv' 1900 | 1901 | def report_extraction(self, file_id): 1902 | """Report information extraction.""" 1903 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)) 1904 | 1905 | def report_direct_download(self, title): 1906 | """Report information extraction.""" 1907 | self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title)) 1908 | 1909 | def _real_extract(self, url): 1910 | mobj = re.match(self._VALID_URL, url) 1911 | if mobj is None: 1912 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 1913 | return 1914 | 1915 | if '?' in url: 1916 | cchar = '&' 1917 | else: 1918 | cchar = '?' 1919 | json_url = url + cchar + 'skin=json&version=2&no_wrap=1' 1920 | request = urllib2.Request(json_url) 1921 | self.report_extraction(mobj.group(1)) 1922 | info = None 1923 | try: 1924 | urlh = urllib2.urlopen(request) 1925 | if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download 1926 | basename = url.split('/')[-1] 1927 | title,ext = os.path.splitext(basename) 1928 | title = title.decode('UTF-8') 1929 | ext = ext.replace('.', '') 1930 | self.report_direct_download(title) 1931 | info = { 1932 | 'id': title, 1933 | 'url': url, 1934 | 'title': title, 1935 | 'ext': ext, 1936 | 'urlhandle': urlh 1937 | } 1938 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1939 | self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err)) 1940 | return 1941 | if info is None: # Regular URL 1942 | try: 1943 | json_code = urlh.read() 1944 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 1945 | self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err)) 1946 | return 1947 | 1948 | try: 1949 | json_data = json.loads(json_code) 1950 | if 'Post' in json_data: 1951 | data = json_data['Post'] 1952 | else: 1953 | data = json_data 1954 | 1955 | upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') 1956 | video_url = data['media']['url'] 1957 | umobj = re.match(self._URL_EXT, video_url) 1958 | if umobj is None: 1959 | raise ValueError('Can not determine filename extension') 1960 | ext = umobj.group(1) 1961 | 1962 | info = { 1963 | 'id': data['item_id'], 1964 | 'url': video_url, 1965 | 'uploader': data['display_name'], 1966 | 'upload_date': upload_date, 1967 | 'title': data['title'], 1968 | 'ext': ext, 1969 | 'format': data['media']['mimeType'], 1970 | 'thumbnail': data['thumbnailUrl'], 1971 | 'description': data['description'], 1972 | 'player_url': data['embedUrl'] 1973 | } 1974 | except 
(ValueError,KeyError), err:
1975 | 			self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
1976 | 			return
1977 | 
1978 | 		return [info]
1979 | 
1980 | 
1981 | class MyVideoIE(InfoExtractor):
1982 | 	"""Information Extractor for myvideo.de."""
1983 | 
1984 | 	_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1985 | 	IE_NAME = u'myvideo'
1986 | 
1987 | 	def __init__(self, downloader=None):
1988 | 		InfoExtractor.__init__(self, downloader)
1989 | 
1990 | 	def report_download_webpage(self, video_id):
1991 | 		"""Report webpage download."""
1992 | 		self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
1993 | 
1994 | 	def report_extraction(self, video_id):
1995 | 		"""Report information extraction."""
1996 | 		self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
1997 | 
1998 | 	def _real_extract(self, url):
1999 | 		mobj = re.match(self._VALID_URL, url)
2000 | 		if mobj is None:
2001 | 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2002 | 			return
2003 | 
2004 | 		video_id = mobj.group(1)
2005 | 
2006 | 		# Get video webpage
2007 | 		request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2008 | 		try:
2009 | 			self.report_download_webpage(video_id)
2010 | 			webpage = urllib2.urlopen(request).read()
2011 | 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2012 | 			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2013 | 			return
2014 | 
2015 | 		self.report_extraction(video_id)
2016 | 		mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2017 | 				webpage)
2018 | 		if mobj is None:
2019 | 			self._downloader.trouble(u'ERROR: unable to extract media URL')
2020 | 			return
2021 | 		video_url = mobj.group(1) + ('/%s.flv' % video_id)
2022 | 
2023 | 		mobj = re.search('<title>([^<]+)</title>', webpage)
2024 | 		if mobj is None:
2025 | 			self._downloader.trouble(u'ERROR: unable to extract title')
2026 | 			return
2027 | 
2028 | 		video_title = mobj.group(1)
2029 | 
2030 | 		return [{
2031 | 			'id': video_id,
2032 | 			'url': video_url,
2033 | 			'uploader': u'NA',
2034 | 			'upload_date': u'NA',
2035 | 			'title': video_title,
2036 | 			'ext': u'flv',
2037 | 			'format': u'NA',
2038 | 			'player_url': None,
2039 | 		}]
2040 | 
2041 | class ComedyCentralIE(InfoExtractor):
2042 | 	"""Information extractor for The Daily Show and Colbert Report"""
2043 | 
2044 | 	_VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2045 | 	IE_NAME = u'comedycentral'
2046 | 
2047 | 	def report_extraction(self, episode_id):
2048 | 		self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2049 | 
2050 | 	def report_config_download(self, episode_id):
2051 | 		self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2052 | 
2053 | 	def report_index_download(self, episode_id):
2054 | 		self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2055 | 
2056 | 	def report_player_url(self, episode_id):
2057 | 		self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2058 | 
2059 | 	def _real_extract(self, url):
2060 | 		mobj = re.match(self._VALID_URL, url)
2061 | 		if mobj is None:
2062 | 			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2063 | 			return
2064 | 
2065 | 		if mobj.group('shortname'):
2066 | 			if mobj.group('shortname') in ('tds', 'thedailyshow'):
2067 | 				url = u'http://www.thedailyshow.com/full-episodes/'
2068 | 			else:
2069 | 				url = u'http://www.colbertnation.com/full-episodes/'
2070 | 			mobj = 
re.match(self._VALID_URL, url) 2071 | assert mobj is not None 2072 | 2073 | dlNewest = not mobj.group('episode') 2074 | if dlNewest: 2075 | epTitle = mobj.group('showname') 2076 | else: 2077 | epTitle = mobj.group('episode') 2078 | 2079 | req = urllib2.Request(url) 2080 | self.report_extraction(epTitle) 2081 | try: 2082 | htmlHandle = urllib2.urlopen(req) 2083 | html = htmlHandle.read() 2084 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2085 | self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err)) 2086 | return 2087 | if dlNewest: 2088 | url = htmlHandle.geturl() 2089 | mobj = re.match(self._VALID_URL, url) 2090 | if mobj is None: 2091 | self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url) 2092 | return 2093 | if mobj.group('episode') == '': 2094 | self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url) 2095 | return 2096 | epTitle = mobj.group('episode') 2097 | 2098 | mMovieParams = re.findall('(?:[^/]+)/(?P[^/?]+)[/?]?.*$' 2178 | IE_NAME = u'escapist' 2179 | 2180 | def report_extraction(self, showName): 2181 | self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName) 2182 | 2183 | def report_config_download(self, showName): 2184 | self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) 2185 | 2186 | def _real_extract(self, url): 2187 | mobj = re.match(self._VALID_URL, url) 2188 | if mobj is None: 2189 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2190 | return 2191 | showName = mobj.group('showname') 2192 | videoId = mobj.group('episode') 2193 | 2194 | self.report_extraction(showName) 2195 | try: 2196 | webPage = urllib2.urlopen(url) 2197 | webPageBytes = webPage.read() 2198 | m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type']) 2199 | webPage = webPageBytes.decode(m.group(1) if m else 'utf-8') 2200 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2201 | self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err)) 2202 | return 2203 | 2204 | descMatch = re.search('[0-9]+)/(?P.*)$' 2252 | IE_NAME = u'collegehumor' 2253 | 2254 | def report_webpage(self, video_id): 2255 | """Report information extraction.""" 2256 | self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) 2257 | 2258 | def report_extraction(self, video_id): 2259 | """Report information extraction.""" 2260 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) 2261 | 2262 | def _real_extract(self, url): 2263 | mobj = re.match(self._VALID_URL, url) 2264 | if mobj is None: 2265 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2266 | return 2267 | video_id = mobj.group('videoid') 2268 | 2269 | self.report_webpage(video_id) 2270 | request = urllib2.Request(url) 2271 | try: 2272 | webpage = urllib2.urlopen(request).read() 2273 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2274 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 2275 | return 2276 | 2277 | m = re.search(r'id="video:(?P[0-9]+)"', webpage) 2278 | if m is None: 2279 | self._downloader.trouble(u'ERROR: Cannot extract internal video ID') 2280 | return 2281 | internal_video_id = m.group('internalvideoid') 2282 | 2283 | info = { 2284 | 'id': video_id, 2285 | 'internal_id': internal_video_id, 2286 | } 2287 | 2288 | self.report_extraction(video_id) 2289 | xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + 
internal_video_id 2290 | try: 2291 | metaXml = urllib2.urlopen(xmlUrl).read() 2292 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2293 | self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err)) 2294 | return 2295 | 2296 | mdoc = xml.etree.ElementTree.fromstring(metaXml) 2297 | try: 2298 | videoNode = mdoc.findall('./video')[0] 2299 | info['description'] = videoNode.findall('./description')[0].text 2300 | info['title'] = videoNode.findall('./caption')[0].text 2301 | info['url'] = videoNode.findall('./file')[0].text 2302 | info['thumbnail'] = videoNode.findall('./thumbnail')[0].text 2303 | info['ext'] = info['url'].rpartition('.')[2] 2304 | info['format'] = info['ext'] 2305 | except IndexError: 2306 | self._downloader.trouble(u'\nERROR: Invalid metadata XML file') 2307 | return 2308 | 2309 | return [info] 2310 | 2311 | 2312 | class XVideosIE(InfoExtractor): 2313 | """Information extractor for xvideos.com""" 2314 | 2315 | _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' 2316 | IE_NAME = u'xvideos' 2317 | 2318 | def report_webpage(self, video_id): 2319 | """Report information extraction.""" 2320 | self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) 2321 | 2322 | def report_extraction(self, video_id): 2323 | """Report information extraction.""" 2324 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) 2325 | 2326 | def _real_extract(self, url): 2327 | mobj = re.match(self._VALID_URL, url) 2328 | if mobj is None: 2329 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2330 | return 2331 | video_id = mobj.group(1).decode('utf-8') 2332 | 2333 | self.report_webpage(video_id) 2334 | 2335 | request = urllib2.Request(r'http://www.xvideos.com/video' + video_id) 2336 | try: 2337 | webpage = urllib2.urlopen(request).read() 2338 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2339 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 2340 | return 2341 | 2342 | self.report_extraction(video_id) 2343 | 2344 | 2345 | # Extract video URL 2346 | mobj = re.search(r'flv_url=(.+?)&', webpage) 2347 | if mobj is None: 2348 | self._downloader.trouble(u'ERROR: unable to extract video url') 2349 | return 2350 | video_url = urllib2.unquote(mobj.group(1).decode('utf-8')) 2351 | 2352 | 2353 | # Extract title 2354 | mobj = re.search(r'(.*?)\s+-\s+XVID', webpage) 2355 | if mobj is None: 2356 | self._downloader.trouble(u'ERROR: unable to extract video title') 2357 | return 2358 | video_title = mobj.group(1).decode('utf-8') 2359 | 2360 | 2361 | # Extract video thumbnail 2362 | mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage) 2363 | if mobj is None: 2364 | self._downloader.trouble(u'ERROR: unable to extract video thumbnail') 2365 | return 2366 | video_thumbnail = mobj.group(1).decode('utf-8') 2367 | 2368 | info = { 2369 | 'id': video_id, 2370 | 'url': video_url, 2371 | 'uploader': None, 2372 | 'upload_date': None, 2373 | 'title': video_title, 2374 | 'ext': 'flv', 2375 | 'format': 'flv', 2376 | 'thumbnail': video_thumbnail, 2377 | 'description': None, 2378 | 'player_url': None, 2379 | } 2380 | 2381 | return [info] 2382 | 2383 | 2384 | class SoundcloudIE(InfoExtractor): 2385 | """Information extractor for soundcloud.com 2386 | To access the media, the uid of the song and a stream token 2387 | must be extracted from the page source and 
the script must make 2388 | a request to media.soundcloud.com/crossdomain.xml. Then 2389 | the media can be grabbed by requesting from an url composed 2390 | of the stream token and uid 2391 | """ 2392 | 2393 | _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' 2394 | IE_NAME = u'soundcloud' 2395 | 2396 | def __init__(self, downloader=None): 2397 | InfoExtractor.__init__(self, downloader) 2398 | 2399 | def report_webpage(self, video_id): 2400 | """Report information extraction.""" 2401 | self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) 2402 | 2403 | def report_extraction(self, video_id): 2404 | """Report information extraction.""" 2405 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) 2406 | 2407 | def _real_extract(self, url): 2408 | mobj = re.match(self._VALID_URL, url) 2409 | if mobj is None: 2410 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2411 | return 2412 | 2413 | # extract uploader (which is in the url) 2414 | uploader = mobj.group(1).decode('utf-8') 2415 | # extract simple title (uploader + slug of song title) 2416 | slug_title = mobj.group(2).decode('utf-8') 2417 | simple_title = uploader + u'-' + slug_title 2418 | 2419 | self.report_webpage('%s/%s' % (uploader, slug_title)) 2420 | 2421 | request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title)) 2422 | try: 2423 | webpage = urllib2.urlopen(request).read() 2424 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2425 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 2426 | return 2427 | 2428 | self.report_extraction('%s/%s' % (uploader, slug_title)) 2429 | 2430 | # extract uid and stream token that soundcloud hands out for access 2431 | mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage) 2432 | if mobj: 2433 | video_id = mobj.group(1) 2434 | stream_token = mobj.group(2) 2435 | 2436 | # extract unsimplified title 2437 | mobj = re.search('"title":"(.*?)",', webpage) 2438 | if mobj: 2439 | title = mobj.group(1).decode('utf-8') 2440 | else: 2441 | title = simple_title 2442 | 2443 | # construct media url (with uid/token) 2444 | mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" 2445 | mediaURL = mediaURL % (video_id, stream_token) 2446 | 2447 | # description 2448 | description = u'No description available' 2449 | mobj = re.search('track-description-value"><p>(.*?)</p>', webpage) 2450 | if mobj: 2451 | description = mobj.group(1) 2452 | 2453 | # upload date 2454 | upload_date = None 2455 | mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage) 2456 | if mobj: 2457 | try: 2458 | upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') 2459 | except Exception, e: 2460 | self._downloader.to_stderr(str(e)) 2461 | 2462 | # for soundcloud, a request to a cross domain is required for cookies 2463 | request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers) 2464 | 2465 | return [{ 2466 | 'id': video_id.decode('utf-8'), 2467 | 'url': mediaURL, 2468 | 'uploader': uploader.decode('utf-8'), 2469 | 'upload_date': upload_date, 2470 | 'title': title, 2471 | 'ext': u'mp3', 2472 | 'format': u'NA', 2473 | 'player_url': None, 2474 | 'description': description.decode('utf-8') 2475 | }] 2476 | 2477 | 2478 | class InfoQIE(InfoExtractor): 2479 | """Information extractor for infoq.com""" 2480 | 2481 | _VALID_URL = 
r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' 2482 | IE_NAME = u'infoq' 2483 | 2484 | def report_webpage(self, video_id): 2485 | """Report information extraction.""" 2486 | self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) 2487 | 2488 | def report_extraction(self, video_id): 2489 | """Report information extraction.""" 2490 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) 2491 | 2492 | def _real_extract(self, url): 2493 | mobj = re.match(self._VALID_URL, url) 2494 | if mobj is None: 2495 | self._downloader.trouble(u'ERROR: invalid URL: %s' % url) 2496 | return 2497 | 2498 | self.report_webpage(url) 2499 | 2500 | request = urllib2.Request(url) 2501 | try: 2502 | webpage = urllib2.urlopen(request).read() 2503 | except (urllib2.URLError, httplib.HTTPException, socket.error), err: 2504 | self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) 2505 | return 2506 | 2507 | self.report_extraction(url) 2508 | 2509 | 2510 | # Extract video URL 2511 | mobj = re.search(r"jsclassref='([^']*)'", webpage) 2512 | if mobj is None: 2513 | self._downloader.trouble(u'ERROR: unable to extract video url') 2514 | return 2515 | video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64')) 2516 | 2517 | 2518 | # Extract title 2519 | mobj = re.search(r'contentTitle = "(.*?)";', webpage) 2520 | if mobj is None: 2521 | self._downloader.trouble(u'ERROR: unable to extract video title') 2522 | return 2523 | video_title = mobj.group(1).decode('utf-8') 2524 | 2525 | # Extract description 2526 | video_description = u'No description available.' 2527 | mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage) 2528 | if mobj is not None: 2529 | video_description = mobj.group(1).decode('utf-8') 2530 | 2531 | video_filename = video_url.split('/')[-1] 2532 | video_id, extension = video_filename.split('.') 2533 | 2534 | info = { 2535 | 'id': video_id, 2536 | 'url': video_url, 2537 | 'uploader': None, 2538 | 'upload_date': None, 2539 | 'title': video_title, 2540 | 'ext': extension, 2541 | 'format': extension, # Extension is always(?) mp4, but seems to be flv 2542 | 'thumbnail': None, 2543 | 'description': video_description, 2544 | 'player_url': None, 2545 | } 2546 | 2547 | return [info] 2548 | 2549 | class MixcloudIE(InfoExtractor): 2550 | """Information extractor for www.mixcloud.com""" 2551 | _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' 2552 | IE_NAME = u'mixcloud' 2553 | 2554 | def __init__(self, downloader=None): 2555 | InfoExtractor.__init__(self, downloader) 2556 | 2557 | def report_download_json(self, file_id): 2558 | """Report JSON download.""" 2559 | self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME) 2560 | 2561 | def report_extraction(self, file_id): 2562 | """Report information extraction.""" 2563 | self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id)) 2564 | 2565 | def get_urls(self, jsonData, fmt, bitrate='best'): 2566 | """Get urls from 'audio_formats' section in json""" 2567 | file_url = None 2568 | try: 2569 | bitrate_list = jsonData[fmt] 2570 | if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list: 2571 | bitrate = max(bitrate_list) # select highest 2572 | 2573 | url_list = jsonData[fmt][bitrate] 2574 | except TypeError: # we have no bitrate info. 
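			# Assumed JSON shapes (values illustrative, not from the live API):
			# with bitrate info jsonData[fmt] is a dict like
			#   {'128': ['http://host/a.mp3', ...], '320': [...]}
			# so jsonData[fmt][bitrate] is a list of candidate urls; without it,
			# jsonData[fmt] is already the flat url list and indexing it with a
			# bitrate string raises the TypeError handled here.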
2549 | class MixcloudIE(InfoExtractor):
2550 |     """Information extractor for www.mixcloud.com"""
2551 |     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2552 |     IE_NAME = u'mixcloud'
2553 | 
2554 |     def __init__(self, downloader=None):
2555 |         InfoExtractor.__init__(self, downloader)
2556 | 
2557 |     def report_download_json(self, file_id):
2558 |         """Report JSON download."""
2559 |         self._downloader.to_screen(u'[%s] %s: Downloading json' % (self.IE_NAME, file_id))
2560 | 
2561 |     def report_extraction(self, file_id):
2562 |         """Report information extraction."""
2563 |         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2564 | 
2565 |     def get_urls(self, jsonData, fmt, bitrate='best'):
2566 |         """Get urls from 'audio_formats' section in json"""
2567 |         # jsonData[fmt] is either {bitrate: [url, ...]} or a flat [url, ...] list
2568 |         try:
2569 |             bitrate_list = jsonData[fmt]
2570 |             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2571 |                 bitrate = max(bitrate_list) # select highest
2572 | 
2573 |             url_list = jsonData[fmt][bitrate]
2574 |         except TypeError: # we have no bitrate info
2575 |             url_list = jsonData[fmt]
2576 |         return url_list
2577 | 
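    # A sketch of get_urls() on the two JSON shapes it tolerates (data made
    # up; the real structure comes from Mixcloud's cloudcast JSON):
    #
    #     >>> ie = MixcloudIE()
    #     >>> ie.get_urls({'mp3': {'128': ['http://a'], '320': ['http://b']}}, 'mp3')
    #     ['http://b']
    #     >>> ie.get_urls({'mp3': ['http://c']}, 'mp3')
    #     ['http://c']
    #
    # Note that max() compares the bitrate keys as strings, so e.g. '64' would
    # rank above '320'; "highest" is only lexicographic here.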
2578 |     def check_urls(self, url_list):
2579 |         """Returns the first active url from the list."""
2580 |         for url in url_list:
2581 |             try:
2582 |                 urllib2.urlopen(url)
2583 |                 return url
2584 |             except (urllib2.URLError, httplib.HTTPException, socket.error):
2585 |                 continue
2586 | 
2587 |         return None
2588 | 
2589 |     def _print_formats(self, formats):
2590 |         print 'Available formats:'
2591 |         for fmt in formats.keys():
2592 |             for b in formats[fmt]:
2593 |                 try:
2594 |                     ext = formats[fmt][b][0]
2595 |                     print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2596 |                 except TypeError: # we have no bitrate info
2597 |                     ext = formats[fmt][0]
2598 |                     print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2599 |                     break
2600 | 
2601 |     def _real_extract(self, url):
2602 |         mobj = re.match(self._VALID_URL, url)
2603 |         if mobj is None:
2604 |             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2605 |             return
2606 |         # extract uploader & filename from url
2607 |         uploader = mobj.group(1).decode('utf-8')
2608 |         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2609 | 
2610 |         # construct API request from the two path components matched above
2611 |         json_url = 'http://www.mixcloud.com/api/1/cloudcast/%s/%s.json' % (mobj.group(1), mobj.group(2))
2612 |         # retrieve .json file with links to files
2613 |         request = urllib2.Request(json_url)
2614 |         try:
2615 |             self.report_download_json(json_url)
2616 |             jsonData = urllib2.urlopen(request).read()
2617 |         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2618 |             self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2619 |             return
2620 | 
2621 |         # parse JSON
2622 |         json_data = json.loads(jsonData)
2623 |         player_url = json_data['player_swf_url']
2624 |         formats = dict(json_data['audio_formats'])
2625 | 
2626 |         req_format = self._downloader.params.get('format', None)
2627 | 
2628 |         if self._downloader.params.get('listformats', None):
2629 |             self._print_formats(formats)
2630 |             return
2631 | 
2632 |         file_url = None
2633 |         if req_format is None or req_format == 'best':
2634 |             for format_param in formats.keys():
2635 |                 url_list = self.get_urls(formats, format_param)
2636 |                 file_url = self.check_urls(url_list) # check urls
2637 |                 if file_url is not None:
2638 |                     break # got it!
2639 |         else:
2640 |             if req_format not in formats.keys():
2641 |                 self._downloader.trouble(u'ERROR: format is not available')
2642 |                 return
2643 |             url_list = self.get_urls(formats, req_format)
2644 |             file_url = self.check_urls(url_list)
2645 |             format_param = req_format
2646 | 
2647 |         if file_url is None:
2648 |             self._downloader.trouble(u'ERROR: unable to find an active download url')
2649 |             return
2650 | 
2651 |         return [{
2652 |             'id': file_id,
2653 |             'url': file_url,
2654 |             'uploader': uploader,
2655 |             'upload_date': u'NA',
2656 |             'title': json_data['name'],
2657 |             'ext': file_url.split('.')[-1],
2658 |             'format': (format_param is None and u'NA' or format_param),
2659 |             'thumbnail': json_data['thumbnail_url'],
2660 |             'description': json_data['description'],
2661 |         }]
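# The extractor below distinguishes three kinds of Open ClassRoom URLs via the
# named groups of its _VALID_URL (illustrative examples, not verified links):
#
#     .../MainFolder/VideoPage.php?course=ML&video=01   -> a single video
#     .../MainFolder/CoursePage.php?course=ML           -> course playlist
#     http://openclassroom.stanford.edu/                -> root playlist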
2662 | class StanfordOpenClassroomIE(InfoExtractor):
2663 |     """Information extractor for Stanford's Open ClassRoom"""
2664 | 
2665 |     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2666 |     IE_NAME = u'stanfordoc'
2667 | 
2668 |     def report_download_webpage(self, objid):
2669 |         """Report information extraction."""
2670 |         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2671 | 
2672 |     def report_extraction(self, video_id):
2673 |         """Report information extraction."""
2674 |         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2675 | 
2676 |     def _real_extract(self, url):
2677 |         mobj = re.match(self._VALID_URL, url)
2678 |         if mobj is None:
2679 |             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2680 |             return
2681 | 
2682 |         if mobj.group('course') and mobj.group('video'): # A specific video
2683 |             course = mobj.group('course')
2684 |             video = mobj.group('video')
2685 |             info = {
2686 |                 'id': course + '_' + video,
2687 |             }
2688 | 
2689 |             self.report_extraction(info['id'])
2690 |             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2691 |             xmlUrl = baseUrl + video + '.xml'
2692 |             try:
2693 |                 metaXml = urllib2.urlopen(xmlUrl).read()
2694 |             except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2695 |                 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2696 |                 return
2697 |             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2698 |             try:
2699 |                 info['title'] = mdoc.findall('./title')[0].text
2700 |                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2701 |             except IndexError:
2702 |                 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2703 |                 return
2704 |             info['ext'] = info['url'].rpartition('.')[2]
2705 |             info['format'] = info['ext']
2706 |             return [info]
2707 |         elif mobj.group('course'): # A course page
2708 |             course = mobj.group('course')
2709 |             info = {
2710 |                 'id': course,
2711 |                 'type': 'playlist',
2712 |             }
2713 | 
2714 |             self.report_download_webpage(info['id'])
2715 |             try:
2716 |                 coursepage = urllib2.urlopen(url).read()
2717 |             except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2718 |                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2719 |                 return
2720 | 
2721 |             m = re.search('<h1>([^<]+)</h1>', coursepage)
2722 |             if m:
2723 |                 info['title'] = unescapeHTML(m.group(1))
2724 |             else:
2725 |                 info['title'] = info['id']
2726 | 
2727 |             m = re.search('<description>([^<]+)</description>', coursepage)
2728 |             if m:
2729 |                 info['description'] = unescapeHTML(m.group(1))
2730 | 
2731 |             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2732 |             info['list'] = [
2733 |                 {
2734 |                     'type': 'reference',
2735 |                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2736 |                 }
2737 |                 for vpage in links]
2738 |             results = []
2739 |             for entry in info['list']:
2740 |                 assert entry['type'] == 'reference'
2741 |                 results += self.extract(entry['url'])
2742 |             return results
2743 | 
2744 |         else: # Root page
2745 |             info = {
2746 |                 'id': 'Stanford OpenClassroom',
2747 |                 'type': 'playlist',
2748 |             }
2749 | 
2750 |             self.report_download_webpage(info['id'])
2751 |             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2752 |             try:
2753 |                 rootpage = urllib2.urlopen(rootURL).read()
2754 |             except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2755 |                 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2756 |                 return
2757 | 
2758 |             info['title'] = info['id']
2759 | 
2760 |             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2761 |             info['list'] = [
2762 |                 {
2763 |                     'type': 'reference',
2764 |                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2765 |                 }
2766 |                 for cpage in links]
2767 | 
2768 |             results = []
2769 |             for entry in info['list']:
2770 |                 assert entry['type'] == 'reference'
2771 |                 results += self.extract(entry['url'])
2772 |             return results
2773 | 
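# Both playlist branches above recurse through self.extract(): the root page
# fans out into CoursePage references and each course page fans out into
# VideoPage references, so a single root URL ultimately yields one info dict
# per video. Deduplication happens only per page, via orderedSet() on the
# scraped links.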
2774 | class MTVIE(InfoExtractor):
2775 |     """Information extractor for MTV.com"""
2776 | 
2777 |     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2778 |     IE_NAME = u'mtv'
2779 | 
2780 |     def report_webpage(self, video_id):
2781 |         """Report information extraction."""
2782 |         self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2783 | 
2784 |     def report_extraction(self, video_id):
2785 |         """Report information extraction."""
2786 |         self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2787 | 
2788 |     def _real_extract(self, url):
2789 |         mobj = re.match(self._VALID_URL, url)
2790 |         if mobj is None:
2791 |             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2792 |             return
2793 |         if not mobj.group('proto'):
2794 |             url = 'http://' + url
2795 |         video_id = mobj.group('videoid')
2796 |         self.report_webpage(video_id)
2797 | 
2798 |         request = urllib2.Request(url)
2799 |         try:
2800 |             webpage = urllib2.urlopen(request).read()
2801 |         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2802 |             self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2803 |             return
2804 | 
2805 |         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2806 |         if mobj is None:
2807 |             self._downloader.trouble(u'ERROR: unable to extract song name')
2808 |             return
2809 |         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2810 |         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2811 |         if mobj is None:
2812 |             self._downloader.trouble(u'ERROR: unable to extract performer')
2813 |             return
2814 |         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2815 |         video_title = performer + ' - ' + song_name
2816 | 
2817 |         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2818 |         if mobj is None:
2819 |             self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
2820 |             return
2821 |         mtvn_uri = mobj.group(1)
2822 | 
2823 |         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2824 |         if mobj is None:
2825 |             self._downloader.trouble(u'ERROR: unable to extract content id')
2826 |             return
2827 |         content_id = mobj.group(1)
2828 | 
2829 |         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2830 |         self.report_extraction(video_id)
2831 |         request = urllib2.Request(videogen_url)
2832 |         try:
2833 |             metadataXml = urllib2.urlopen(request).read()
2834 |         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2835 |             self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2836 |             return
2837 | 
2838 |         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2839 |         renditions = mdoc.findall('.//rendition')
2840 | 
2841 |         # For now, always pick the highest quality.
2842 |         rendition = renditions[-1]
2843 | 
2844 |         try:
2845 |             _, _, ext = rendition.attrib['type'].partition('/')
2846 |             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2847 |             video_url = rendition.find('./src').text
2848 |         except KeyError:
2849 |             self._downloader.trouble(u'ERROR: invalid rendition field')
2850 |             return
2851 | 
2852 |         info = {
2853 |             'id': video_id,
2854 |             'url': video_url,
2855 |             'uploader': performer,
2856 |             'title': video_title,
2857 |             'ext': ext,
2858 |             'format': format,
2859 |         }
2860 | 
2861 |         return [info]
2862 | 
--------------------------------------------------------------------------------
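The rendition selection in MTVIE can be exercised in isolation. Below is a minimal sketch feeding the same xml.etree logic a hand-written mediaGen-style document; the element names, attributes, and URLs are guesses inferred from the parsing code above, not taken from any MTV documentation, and picking renditions[-1] as the "highest quality" assumes the server lists renditions in ascending quality.

    # demo_mediagen.py -- standalone illustration of MTVIE's rendition parsing
    import xml.etree.ElementTree

    # Hypothetical mediaGen response; shape inferred from MTVIE._real_extract()
    SAMPLE = """<package><video><item>
      <rendition type="video/mp4" width="640" height="480" bitrate="800">
        <src>rtmpe://example.invalid/video_800.mp4</src>
      </rendition>
      <rendition type="video/mp4" width="1280" height="720" bitrate="2000">
        <src>rtmpe://example.invalid/video_2000.mp4</src>
      </rendition>
    </item></video></package>"""

    mdoc = xml.etree.ElementTree.fromstring(SAMPLE)
    renditions = mdoc.findall('.//rendition')
    rendition = renditions[-1]  # last entry, assumed to be the highest quality

    _, _, ext = rendition.attrib['type'].partition('/')
    format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']

    print format                        # mp4-1280x720_2000
    print rendition.find('./src').text  # rtmpe://example.invalid/video_2000.mp4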