├── .gitignore ├── .travis.yml ├── README.md ├── beetsplug ├── __init__.py └── ydl.py ├── env.config.yml ├── env.develop ├── requirements.txt ├── setup.py └── test /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | 3 | /build 4 | /dist 5 | /env/* 6 | env.lib.db 7 | *.egg-info 8 | 9 | __pycache__ 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.5" 4 | - "3.6" 5 | install: 6 | - python setup.py install 7 | script: 8 | - ./test -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Travis CI](https://api.travis-ci.org/vmassuchetto/beets-ydl.svg?branch=master) 2 | 3 | # beets ydl 4 | 5 | Download audio from youtube-dl sources and import into beets 6 | 7 | $ beet ydl "https://www.youtube.com/watch?v=wW6ykueIhX8" 8 | 9 | $ beet ls short music for short people 10 | 11 | 59 Times the Pain - Short Music for Short People - We Want the Kids 12 | 7 Seconds - Short Music for Short People - F.O.F.O.D. 13 | 88 Fingers Louie - Short Music for Short People - All My Friends Are in Popular Bands 14 | Adrenalin O.D. - Short Music for Short People - Your Kung Fu Is Old... And Now You Must Die! 15 | Aerobitch - Short Music for Short People - Steamroller Blues 16 | [...] 17 | 18 | ## Installation 19 | 20 | pip install beets-ydl 21 | 22 | And enable `ydl` plugin on your `config.yaml` file. 
23 | 24 | ## Configuration 25 | 26 | Available options and default values on `config.yaml`: 27 | 28 | ```yml 29 | plugins: ydl 30 | 31 | ydl: 32 | download: True # download files from sources after getting information, 33 | split_files: True # try to split album files into separate tracks, 34 | import: True # import files into beets after downloading and splitting, 35 | youtubedl_options: {} # youtube-dl available options -- https://git.io/fN0c7 36 | urls: [] # list of default urls to download when no arguments are provided, you 37 | # can provide a playlist to get checked every time 38 | ``` 39 | 40 | ## How it works 41 | 42 | The plugin's main goal is to deliver an importable file set to the `beet import` 43 | command, so it will download an audio file, look for a tracklist with track 44 | times in the video description, split the file into per-track files, assign 45 | some basic ID3 tags to them, and finally run `beet import` on the 46 | `${BEETS_CONFIG}/ydl-cache/${VIDEO_ID}` directory. 47 | 48 | ## Tips 49 | 50 | - The video title can keep beets from finding the correct album; in this case you'll 51 | have to manually enter a search term 52 | 53 | - Use the `bandcamp` plugin for better results 54 | 55 | - Use a `.netrc` file to use your own YouTube playlists 56 | 57 | Security concerns aside, you can create a `~/.netrc` with credentials for 58 | youtube-dl to read. 59 | 60 | machine youtube login somelogin@gmail.com password somepassword 61 | 62 | Check [this entry](https://git.io/fN2TD) in the youtube-dl docs for more 63 | information. 
64 | 65 | This way, you can download private playlists or your subscriptions: 66 | 67 | beet ydl "https://www.youtube.com/feed/subscriptions" 68 | 69 | - Download and import later 70 | 71 | To download and split files without importing into beets: 72 | 73 | beet ydl "<url>" --keep-files --no-import 74 | 75 | And later, to import: 76 | 77 | beet ydl "<url>" --no-download --no-split-files 78 | 79 | This way, you can download a big playlist and then run the beets import 80 | routine, which requires manual intervention. 81 | 82 | - (possibly) enhance audio quality 83 | 84 | beets-ydl uses a proposed [192kbps extractor 'bestaudio'](https://git.io/fN2mJ) 85 | format because it is more likely that it will find separate audio files on 86 | sources. Some high quality videos might have better audio quality embedded, so 87 | it can also make sense to set a higher quality extractor: 88 | 89 | ```yaml 90 | ydl: 91 | youtubedl_options: 92 | format: 'best' 93 | postprocessors: 94 | - key: 'FFmpegExtractAudio' 95 | preferredcodec: 'mp3' 96 | preferredquality: '320' 97 | nopostoverwrites: True 98 | ``` 99 | 100 | This can, however, end up producing unnecessarily big files that have 320kbps as a 101 | merely nominal quality. See [this discussion](https://askubuntu.com/q/634584). 102 | 103 | ## Development 104 | 105 | Execute the env script to get into a virtualenv. 106 | 107 | . ./env.develop 108 | -------------------------------------------------------------------------------- /beetsplug/__init__.py: -------------------------------------------------------------------------------- 1 | from pkgutil import extend_path 2 | __path__ = extend_path(__path__, __name__) -------------------------------------------------------------------------------- /beetsplug/ydl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2016, Vinicius Massuchetto. 
3 | # 4 | # Permission is hereby granted, free of charge, to any person obtaining 5 | # a copy of this software and associated documentation files (the 6 | # "Software"), to deal in the Software without restriction, including 7 | # without limitation the rights to use, copy, modify, merge, publish, 8 | # distribute, sublicense, and/or sell copies of the Software, and to 9 | # permit persons to whom the Software is furnished to do so, subject to 10 | # the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be 13 | # included in all copies or substantial portions of the Software. 14 | 15 | from beets import config 16 | from beets import ui 17 | from beets.plugins import BeetsPlugin 18 | from optparse import OptionParser 19 | from pathlib import Path 20 | from shutil import copyfile 21 | from xdg import BaseDirectory 22 | from youtube_dl import YoutubeDL 23 | from hashlib import md5 24 | import glob 25 | import json 26 | import os 27 | import re 28 | import shutil 29 | import subprocess 30 | import uuid 31 | 32 | class Colors(): 33 | INFO = '\033[94m' 34 | SUCCESS = '\033[92m' 35 | WARNING = '\033[93m' 36 | BOLD = '\033[1m' 37 | END = '\033[0m' 38 | 39 | class YdlPlugin(BeetsPlugin): 40 | """A plugin for downloading music from YouTube and importing into beets. 41 | 42 | It tries to split album files if it can identify track times somewhere. 
43 | """ 44 | def __init__(self, *args, **kwargs): 45 | """Set default values 46 | 47 | `self.config['youtubedl_options']` is a dict with a lot of options 48 | available from youtube-dl: https://git.io/fN0c7 49 | """ 50 | super(YdlPlugin, self).__init__() 51 | 52 | self.search_query = "https://www.youtube.com/results?search_query=" 53 | self.config_dir = config.config_dir() 54 | self.cache_dir = self.config_dir + "/ydl-cache" 55 | self.outtmpl = self.cache_dir + "/%(id)s/%(id)s.%(ext)s" 56 | 57 | # Default options 58 | self._config = { 59 | 'urls': [], 60 | 'verbose': False, 61 | 'youtubedl_options': { 62 | 'verbose': False, 63 | 'keepvideo': False, 64 | 'cachedir': self.cache_dir, 65 | 'outtmpl': self.outtmpl, 66 | 'restrictfilenames': True, 67 | 'ignoreerrors': True, 68 | 'nooverwrites': True, 69 | 'writethumbnail': True, 70 | 'quiet': True, 71 | 'usenetrc': os.path.exists( 72 | os.path.join(str(Path.home()), ".netrc")), 73 | 'format': 'bestaudio/best', 74 | 'postprocessors': [{ 75 | 'key': 'FFmpegExtractAudio', 76 | 'preferredcodec': 'mp3', 77 | 'preferredquality': '192', 78 | 'nopostoverwrites': True 79 | }] 80 | } 81 | } 82 | self._config.update(self.config) 83 | self.config = self._config 84 | 85 | # be verbose if beets is verbose 86 | if not self.config.get('verbose'): 87 | self.config['verbose'] = True 88 | 89 | def commands(self): 90 | outer_class = self 91 | 92 | def ydl_func(lib, opts, args): 93 | """Parse args and download one source at a time to pass it to 94 | beets 95 | """ 96 | for opt, value in opts.__dict__.items(): 97 | self.config[opt] = value 98 | 99 | if len(args) > 0: 100 | for arg in args: 101 | outer_class.youtubedl(lib, opts, arg) 102 | elif self.config.get('urls') is not None: 103 | if self.config.get('verbose'): 104 | print("[ydl] Falling back to default urls") 105 | for url in self.config.get('urls'): 106 | outer_class.youtubedl(lib, opts, str(url)) 107 | 108 | parser = OptionParser() 109 | parser.add_option("--no-download", 
action="store_false", 110 | default=True, dest="download", help="don't actually " + \ 111 | "download files, only the descriptions") 112 | parser.add_option("--no-split-files", action="store_false", 113 | default=True, dest="split_files", help="don't try " + \ 114 | "to split files when an album is identified") 115 | parser.add_option("--no-import", action="store_false", 116 | default=True, dest="import", help="do not import into " + \ 117 | "beets after downloading and processing") 118 | parser.add_option("-f", "--force-download", action="store_true", 119 | default=False, dest="force_download", help="always download " + \ 120 | "and overwrite files") 121 | parser.add_option("-k", "--keep-files", action="store_true", 122 | default=False, dest="keep_files", help="keep the files " + \ 123 | "downloaded on cache, useful for caching or bulk importing") 124 | parser.add_option("-w", "--write-dummy-mp3", action="store_true", 125 | default=False, dest="write_dummy_mp3", help="write blank " + \ 126 | "dummy mp3 files with valid ID3 information") 127 | parser.add_option("-v", "--verbose", action="store_true", 128 | dest="verbose", default=False, help="print processing " + \ 129 | "information") 130 | 131 | ydl_cmd = ui.Subcommand('ydl', parser=parser, 132 | help=u'Download music from YouTube') 133 | ydl_cmd.func = ydl_func 134 | 135 | return [ydl_cmd] 136 | 137 | def youtubedl(self, lib, opts, arg): 138 | """Calls YoutubeDL 139 | 140 | Call beets when finishes downloading the audio file. We don't implement 141 | a YoutubeDL's post processor because we want to call beets for every 142 | download, and not after downloading a lot of files. 143 | 144 | So we try to read `YoutubeDL.extract_info` entries and process them 145 | with an internal `YoutubeDL.process_ie_result` method, that will 146 | actually download the audio file. 
147 | """ 148 | if self.config.get('verbose'): 149 | print("[ydl] Calling youtube-dl") 150 | 151 | youtubedl_config = self.config.get('youtubedl_options') 152 | youtubedl_config['keepvideo'] = self.config.get('keep_files') 153 | y = YoutubeDL(youtubedl_config) 154 | 155 | ie_result = y.extract_info(arg, download=False, process=False) 156 | 157 | if ie_result is None: 158 | print("[ydl] Error: Failed to fetch file information.") 159 | print("[ydl] If this is not a network problem, try upgrading") 160 | print("[ydl] beets-ydl:") 161 | print("[ydl]") 162 | print("[ydl] pip install -U beets-ydl") 163 | print("[ydl]") 164 | exit(1) 165 | 166 | if 'entries' in ie_result: 167 | entries = ie_result['entries'] 168 | else: 169 | entries = [ie_result] 170 | 171 | download = self.config.get('download') 172 | if self.config.get('force_download'): 173 | download = True 174 | 175 | for entry in entries: 176 | items = [x for x in lib.items('ydl:' + entry['id'])] + \ 177 | [x for x in lib.albums('ydl:' + entry['id'])] 178 | 179 | if len(items) > 0 and not self.config.get('force_download'): 180 | if self.config.get('verbose'): 181 | print('[ydl] Skipping item already in library:' + \ 182 | ' %s [%s]' % (entry['title'], entry['id'])) 183 | continue 184 | 185 | if self.config.get('verbose') and not download: 186 | print("[ydl] Skipping download: " + entry['id']) 187 | 188 | data = y.process_ie_result(entry, download=download) 189 | if data: 190 | ie_result.update(data) 191 | self.info = ie_result 192 | self.process_item() 193 | else: 194 | print("[ydl] No data for " + entry['id']) 195 | 196 | def is_in_library(self, entry, lib): 197 | """Check if an `entry` is already in the `lib` beets library 198 | """ 199 | if lib.items(('ydl_id', entry['id'])): 200 | return True 201 | else: 202 | return False 203 | 204 | def get_file_path(self, ext): 205 | return self.outtmpl % { 'id': self.info.get('id'), 'ext': ext } 206 | 207 | def is_album(self): 208 | return self.fullalbum_stripped or 
len(self.tracks) > 1 209 | 210 | def process_item(self): 211 | """Called after downloading source with YoutubeDL 212 | 213 | From here on, the plugin assumes its state according to what 214 | is being downloaded. 215 | """ 216 | print('[ydl] Processing item: ' + self.info.get('title')) 217 | 218 | ext = self.config.get('youtubedl_options')\ 219 | ['postprocessors'][0]['preferredcodec'] 220 | self.audio_file = self.get_file_path(ext) 221 | self.outdir, self.audio_file_ext = os.path.splitext(self.audio_file) 222 | self.outdir = os.path.dirname(self.outdir) 223 | 224 | if self.config.get('verbose') and \ 225 | self.config.get('download') and \ 226 | not os.path.exists(self.audio_file): 227 | print('[ydl] Error: Audio file not found: ' + self.audio_file) 228 | exit(1) 229 | 230 | self.strip_fullalbum() 231 | self.extract_tracks() 232 | 233 | if not self.is_album(): 234 | self.set_single_file_data() 235 | 236 | if self.config.get('verbose'): 237 | print(self.get_tracklist()) 238 | 239 | if self.config.get('write_dummy_mp3'): 240 | self.write_dummy_mp3() 241 | 242 | if self.config.get('verbose') and self.is_album(): 243 | print("[ydl] URL is identified as an album") 244 | else: 245 | print("[ydl] URL is identified as a singleton") 246 | 247 | if self.config.get('split_files') \ 248 | and not self.config.get('write_dummy_mp3') \ 249 | and self.is_album(): 250 | self.split_file() 251 | 252 | if self.config.get('import'): 253 | beet_cmd = self.get_beet_cmd() 254 | if self.config.get('verbose'): 255 | print("[ydl] Running beets: " + ' '.join(beet_cmd)) 256 | subprocess.run(beet_cmd) 257 | elif self.config.get('verbose'): 258 | print('[ydl] Skipping import') 259 | 260 | if not self.config.get('keep_files'): 261 | self.clean() 262 | elif self.config.get('verbose') and self.config.get('keep_files'): 263 | print('[ydl] Keeping downloaded files on ' + self.outdir) 264 | 265 | def get_beet_cmd(self): 266 | beet_cmd = ['beet'] 267 | 268 | if os.getenv('BEETS_ENV') == 'develop': 269 
| beet_cmd.extend(['-c', 'env.config.yml']) 270 | 271 | if self.config.get('verbose'): 272 | beet_cmd.extend(['-v']) 273 | 274 | beet_cmd.extend(['import', '--set', 'ydl=' + self.info.get('id')]) 275 | 276 | if not self.is_album(): 277 | beet_cmd.extend(['--singletons']) 278 | 279 | if os.path.exists(self.outdir): 280 | beet_cmd.extend([self.outdir]) 281 | else: 282 | beet_cmd.extend([self.audio_file]) 283 | 284 | return beet_cmd 285 | 286 | def __exit__(self, exc_type, exc_value, traceback): 287 | cache_size = self.config.get('cache_dir') 288 | if cache_size > 0: 289 | print("[ydl] " + cache_size + " in cache") 290 | 291 | if self.config.get('verbose'): 292 | print('[ydl] Leaving') 293 | 294 | def clean(self): 295 | """Deletes everything related to the present run. 296 | """ 297 | files = glob.glob(self.outdir + '*') 298 | for f in files: 299 | if os.path.isdir(f): 300 | shutil.rmtree(f) 301 | else: 302 | os.remove(f) 303 | 304 | def strip_fullalbum(self): 305 | """Will remove '[Full Album]' entries on video title. 306 | """ 307 | regex = re.compile(r'\S*?(fullalbum|full[^a-z]+album|album)\S*?', 308 | re.IGNORECASE) 309 | title = regex.sub('', self.info.get('title')) 310 | if title != self.info.get('title'): 311 | self.info['title'] = title 312 | self.fullalbum_stripped = True 313 | 314 | self.fullalbum_stripped = False 315 | 316 | def split_file(self): 317 | """Split downloaded file into multiple tracks 318 | 319 | Tries to parse metadata from the video description. 
320 | """ 321 | # @TODO check for overwrites according to options 322 | 323 | if self.config.get('verbose'): 324 | print("[ydl] Splitting tracks") 325 | 326 | cmds = [] 327 | ffmpeg_cmd = ['ffmpeg', '-y', '-i', self.audio_file, 328 | '-acodec', 'copy'] 329 | 330 | if not os.path.exists(self.outdir): 331 | os.mkdir(self.outdir) 332 | 333 | file_id = os.path.basename(os.path.normpath(self.outdir)) 334 | 335 | for track in self.tracks: 336 | opts = ['-ss', str(track['start']), '-to', str(track['end'])] 337 | 338 | for k in track.keys(): 339 | opts.extend(['-metadata', '%s=%s' % (k, track[k])]) 340 | 341 | outfile = '%s/%03d-%s%s' % (self.outdir, 342 | track['track'], file_id, self.audio_file_ext) 343 | opts.extend([outfile]) 344 | 345 | cmds.append(ffmpeg_cmd + opts) 346 | 347 | if len(cmds) > 0 and os.path.exists(self.audio_file): 348 | print("[ydl] Running ffmpeg") 349 | for cmd in cmds: 350 | subprocess.run(cmd, stdout=subprocess.PIPE, 351 | stderr=subprocess.PIPE) 352 | os.remove(self.audio_file) 353 | 354 | def clean_str(self, s): 355 | s = re.sub(r'[^0-9a-zA-Z ]', '', s) 356 | s = re.sub(r'\s+', ' ', s) 357 | s = s.strip() 358 | 359 | return s 360 | 361 | def get_common_metadata(self): 362 | """Tries to translate metadata parsed from video description into file 363 | metadata. 364 | 365 | Will also remove years from title. 366 | """ 367 | metadata = {} 368 | 369 | year = self.get_year() 370 | if year is not None: 371 | metadata['year'] = year 372 | 373 | metadata['artist'], metadata['album'] = self.parse_title() 374 | 375 | return metadata 376 | 377 | def get_year(self): 378 | year_regex = r'[^\S]?([12][0-9]{3})[^\S]?' 
379 | regex = re.compile(year_regex) 380 | matches = regex.match(self.info.get('title')) 381 | if matches: 382 | self.info['title'] = re.sub(year_regex, '', self.info['title']) 383 | year = matches.group(1) 384 | return year 385 | 386 | return None 387 | 388 | def parse_title(self): 389 | """Parse the title trying to find an "Artist - Album" pattern 390 | """ 391 | seps_regex = r'(.*?)[-~|*%#](.*)' 392 | regex = re.compile(seps_regex) 393 | 394 | if regex.match(self.info.get('title')): 395 | art_alb = regex.findall(self.info.get('title')) 396 | first = art_alb[0][0] 397 | second = art_alb[0][1] 398 | 399 | # in beets we trust 400 | else: 401 | first = self.info.get('title') 402 | second = self.info.get('title') 403 | 404 | return (self.clean_str(first), self.clean_str(second)) 405 | 406 | def to_seconds(self, time): 407 | """Convert MM:SS to seconds 408 | """ 409 | secs = 0 410 | parts = [int(s) for s in time.split(':')] 411 | secs = parts[len(parts)-1] 412 | secs += parts[len(parts)-2] * 60 413 | if len(parts) > 2: 414 | secs += parts[len(parts)-3] * 3600 415 | 416 | return secs 417 | 418 | def to_hms(self, seconds): 419 | """Convert seconds to HH:MM:SS 420 | """ 421 | seconds, sec = divmod(float(seconds), 60) 422 | hr, min = divmod(seconds, 60) 423 | 424 | return "%d:%02d:%02d" % (hr, min, sec) 425 | 426 | def extract_tracks(self): 427 | """Try different methods to extract tracks metadata 428 | """ 429 | print("[ydl] Extracting tracks metadata") 430 | 431 | self.tracks = [] 432 | if os.path.exists(self.audio_file): 433 | self.tracks = self.extract_tracks_from_chapters() 434 | elif self.config.get('verbose'): 435 | print("[ydl] Audio file not found, won't look for chapters") 436 | 437 | if len(self.tracks) == 0: 438 | if self.config.get('verbose'): 439 | print("[ydl] Chapters not found, trying video description") 440 | self.tracks = self.extract_tracktimes_from_string( 441 | self.info.get('description')) 442 | 443 | if len(self.tracks) > 0: 444 | 
self.extract_tracks_cleanup() 445 | 446 | common_metadata = self.get_common_metadata() 447 | 448 | for i in range(0, len(self.tracks) - 1): 449 | self.tracks[i].update(common_metadata) 450 | 451 | def get_tracklist(self): 452 | output = [] 453 | if len(self.tracks) > 1: 454 | for track in self.tracks: 455 | output.append("[ydl] %03d: %s (%s - %s)" % ( 456 | track['track'], 457 | track['title'], 458 | self.to_hms(track['start']), 459 | self.to_hms(track['end']))) 460 | else: 461 | for track in self.tracks: 462 | output.append("[ydl] %s (%s - %s)" % ( 463 | track['title'], 464 | self.to_hms(track['start']), 465 | self.to_hms(track['end']))) 466 | 467 | return "\n".join(output) 468 | 469 | def extract_tracks_from_chapters(self): 470 | """Read chapters tags on file to find times and metadata 471 | """ 472 | tracks = [] 473 | ffprobe_cmd = ['ffprobe', '-i', self.audio_file] 474 | info = str(subprocess.run(ffprobe_cmd, 475 | stderr=subprocess.PIPE).stderr) 476 | 477 | chapters_regex = r'\s+Chapter\s+' + \ 478 | r'#(?P[:0-9]+).*?' + \ 479 | r'start\s+(?P[0-9.]+).*?' + \ 480 | r'end\s+(?P[0-9.]+).*?' + \ 481 | r'Metadata:(?P\\n' + \ 482 | r'\s+(?P\S+)\s+:' + \ 483 | r'\s+(?P.*?)' + \ 484 | r'\\n)+?' 485 | regex = re.compile(chapters_regex, re.DOTALL) 486 | for fields in re.findall(regex, info): 487 | trackno = int(re.sub(r'[^0-9]', '', fields[0])) + 1 488 | 489 | track = { 490 | 'track': trackno, 491 | 'start': fields[1], 492 | 'end': fields[2], 493 | } 494 | index = 4 495 | while index < len(fields) - 1: 496 | track[self.clean_str(fields[index])] = \ 497 | self.clean_str(fields[index + 1]) 498 | index += 2 499 | tracks.append(track) 500 | 501 | return tracks 502 | 503 | def extract_tracktimes_from_string(self, s): 504 | """Try to find HH:MM patterns as track times on description 505 | """ 506 | tracks_regex = \ 507 | r'^(.*?)(?P