├── shiradl ├── __init__.py ├── __main__.py ├── util.py ├── mbtag.py ├── tagging.py ├── metadata.py ├── cli.py ├── dl.py └── musicbrainz.py ├── .gitignore ├── pyproject.toml ├── mbtag.md ├── LICENSE ├── .github └── workflows │ └── main.yml ├── README.md └── logo.svg /shiradl/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.7.1" 2 | -------------------------------------------------------------------------------- /shiradl/__main__.py: -------------------------------------------------------------------------------- 1 | from .cli import cli 2 | 3 | cli() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.json 3 | *.opus 4 | *.mp3 5 | *.wav 6 | *.flac 7 | *.sqlite -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "shiradl" 3 | description = "Download music from YouTube, YouTube Music and Soundcloud, with great metadata and little effort." 
4 | requires-python = ">=3.12" 5 | authors = [{ name = "KraXen72" }, { name = "glomatico" }] 6 | dependencies = [ 7 | "click", 8 | "yt-dlp >= 2025.03.27", 9 | "ytmusicapi >= 1.10.2", 10 | "mediafile", 11 | "pillow", 12 | "requests_cache", 13 | "python-dateutil" 14 | ] 15 | readme = "README.md" 16 | dynamic = ["version"] 17 | 18 | [project.urls] 19 | repository = "https://github.com/KraXen72/shira" 20 | 21 | [build-system] 22 | requires = ["flit_core"] 23 | build-backend = "flit_core.buildapi" 24 | 25 | [project.scripts] 26 | shiradl = "shiradl.cli:cli" 27 | mbtag = "shiradl.mbtag:mbtag_cli" 28 | -------------------------------------------------------------------------------- /mbtag.md: -------------------------------------------------------------------------------- 1 | # mbtag 2 | work in progress 3 | 4 | tagging utility which is part of shira 5 | primary use is to add MusicBrainz ID tags to existing songs which were not downloaded by shira 6 | 7 | docs TBD 8 | just use the main cli for now, or explore the code 9 | 10 | ``` 11 | Usage: python -m mbtag [OPTIONS] INPUT_PATH 12 | 13 | Options: 14 | -c, --fetch-complete Fetch from MusicBrainz even if has mb_releasetrackid, 15 | mb_releasegroupid, mb_artistid, mb_albumartistid 16 | present. 17 | -p, --fetch-partial Fetch from MusicBrainz even if has some mb_* tags 18 | present. 19 | -d, --dry-run Don't write to any files, just print out the mb_* tags 20 | -g, --debug Prints out extra information for debugging. Does not 21 | imply --dry-run. 22 | --help Show this message and exit.
23 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 KraXen72 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: publish 2 | 3 | # Controls when the workflow will run 4 | on: 5 | 6 | # Workflow will run when a release has been published for the package 7 | release: 8 | branches: 9 | - master 10 | types: 11 | - published 12 | 13 | # Allows you to run this workflow manually from the Actions tab 14 | workflow_dispatch: 15 | 16 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel 17 | jobs: 18 | 19 | # This workflow contains a single job called "publish" 20 | publish: 21 | 22 | # The type of runner that the job will run on 23 | runs-on: ubuntu-latest 24 | 25 | # Steps represent a sequence of tasks that will be executed as part of the job 26 | steps: 27 | 28 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 29 | - uses: actions/checkout@v3 30 | 31 | - name: Set up Python 3.9 32 | uses: actions/setup-python@v3 33 | with: 34 | python-version: 3.9 35 | cache: pip 36 | 37 | - name: To PyPI using Flit 38 | uses: AsifArmanRahman/to-pypi-using-flit@v1 39 | with: 40 | password: ${{ secrets.PYPI_API_TOKEN }} 41 | -------------------------------------------------------------------------------- /shiradl/util.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import math 4 | from os import path 5 | 6 | longest_line2 = -1 7 | 8 | class TermColors: 9 | HEADER = "\033[95m" 10 | OKBLUE = "\033[94m" 11 | OKCYAN = "\033[96m" 12 | OKGREEN = "\033[92m" 13 | WARNING = "\033[93m" 14 | FAIL = "\033[91m" 15 | ENDC = "\033[0m" 16 | BOLD = "\033[1m" 17 | UNDERLINE = "\033[4m" 18 | 19 | def print_color(color: TermColors, text: str): 20 | print(f"{color}{text}{TermColors.ENDC}") 21 | 22 | 23 | def pprint(val, no_null = False): 24 | """mediafile-specific pretty 
print""" 25 | if not isinstance(val, dict): 26 | print(val) 27 | return 28 | d = {} 29 | for [k, v] in val.items(): 30 | if isinstance(v, bytes): 31 | decoded = "" 32 | try: 33 | decoded = v.decode("utf-8") 34 | except UnicodeDecodeError: 35 | decoded = "" 36 | d[k] = decoded 37 | elif isinstance(v, datetime.date): 38 | d[k] = f"date({v.isoformat()})" 39 | elif v is None: 40 | if no_null: 41 | continue 42 | else: 43 | d[k] = "null" 44 | else: 45 | try: 46 | json.dumps(v) 47 | d[k] = v 48 | except TypeError: 49 | d[k] = f"{str(type(v))} is/contains non-serializable" 50 | print(json.dumps(d, indent=2)) 51 | 52 | def end_path(fp: str, segments = 3): 53 | parts = fp.split(path.sep) 54 | return path.sep.join(parts[-segments:]) 55 | 56 | def progprint(curr: int, total: int, width = 10, message = "", end = "\r"): 57 | global longest_line2 58 | perc_factor = (curr / total) 59 | scaled_perc = math.floor(width * perc_factor) 60 | if curr == total: 61 | scaled_perc = width 62 | perc_factor = 1 63 | remainder = width - scaled_perc 64 | line2 = f" {message}" if message.strip() != "" else "" 65 | if len(line2) > longest_line2: 66 | longest_line2 = len(line2) 67 | len_diff = longest_line2 - len(line2) 68 | if len_diff > 0: # flush previous line2 69 | line2 += " " * len_diff 70 | 71 | print(f"[{'=' * scaled_perc}{' ' * remainder}] {(perc_factor): 5.0%}{line2}", end=end) 72 | -------------------------------------------------------------------------------- /shiradl/mbtag.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import click 5 | from mediafile import MediaFile, FileTypeError 6 | 7 | from .musicbrainz import MBSong 8 | from .util import TermColors, end_path, pprint, progprint 9 | 10 | # Define supported extensions list using the keys from the TYPES dictionary 11 | SONG_EXTS = ["mp3", "aac", "alac", "ogg", "opus", "flac", "ape", "wv", "mpc", "asf", "aiff", "dsf", "wav"] 12 | MBID_TAG_KEYS = 
["mb_releasetrackid", "mb_releasegroupid", "mb_artistid", "mb_albumartistid"] 13 | 14 | # Function to check if a file is a supported song file 15 | def is_supported_song_file(filename): 16 | ext = os.path.splitext(filename)[1][1:] 17 | if ext.lower() in SONG_EXTS: 18 | return True 19 | try: 20 | MediaFile(filename) 21 | return True 22 | except FileTypeError: 23 | return False 24 | 25 | 26 | def process_directory(directory_or_file: click.Path, fetch_complete: bool, fetch_partial: bool, dry_run: bool, debug: bool): 27 | if not os.path.exists(str(directory_or_file)): 28 | print(f"[error]: Path '{directory_or_file}' does not exist.") 29 | return 30 | if os.path.isfile(str(directory_or_file)): 31 | process_song(str(directory_or_file), 0, 1, fetch_complete, fetch_partial, dry_run, debug) 32 | print() 33 | return 34 | for root, _, files in os.walk(str(directory_or_file)): 35 | for i in range(len(files)): 36 | f = files[i] 37 | filepath = os.path.join(root, f) 38 | if not is_supported_song_file(filepath): 39 | continue 40 | try: 41 | process_song(filepath, i, len(files), fetch_complete, fetch_partial, dry_run, debug) 42 | # print() 43 | except Exception as e: 44 | print(f"Error processing song '{filepath}':") 45 | print(e) 46 | progprint(100, 100, message=f"Processed all files in {end_path(root, 2)}") 47 | print() 48 | 49 | 50 | def has_all_mbid_tags(handle: MediaFile): 51 | """these files are skipped alltogether""" 52 | handle_dict = handle.as_dict() 53 | return all(handle_dict.get(key) is not None for key in MBID_TAG_KEYS) 54 | 55 | 56 | def no_of_mbid_tags(handle: MediaFile): 57 | handle_dict = handle.as_dict() 58 | return sum([handle_dict.get(key) is not None for key in MBID_TAG_KEYS]) 59 | 60 | def red_if_none(val: str | None): 61 | if val is None: 62 | return TermColors.FAIL + "None" + TermColors.ENDC 63 | else: 64 | return val 65 | 66 | 67 | def process_song(filepath: str, ind: int, total: int, fetch_complete: bool, fetch_partial: bool, dry_run=False, debug=False): 
68 | handle = MediaFile(filepath) 69 | has_all = has_all_mbid_tags(handle) 70 | has_some = no_of_mbid_tags(handle) 71 | status = f"[song] {end_path(filepath, 2)}, has_all: {has_all}, has_some: {has_some}" 72 | progprint(ind, total, message=status) 73 | # pprint(handle.as_dict(), True) 74 | 75 | # by default, partials and completes are not fetched 76 | continue_partials = has_some == 0 or (has_some > 0 and (fetch_partial or fetch_complete)) 77 | continue_complete = not has_all or (has_all and fetch_complete) 78 | # print(f"continue_partials: {continue_partials}, continue_complete: {continue_complete} ") 79 | 80 | if not (continue_partials and continue_complete): 81 | msg = f"[skipped] check args for fetching complete or partial songs. c:{int(not continue_complete)}, p:{int(not continue_partials)} " 82 | progprint(ind, total, message=msg) 83 | # print(msg) 84 | return 85 | if handle.title is None or handle.artist is None: 86 | msg = "[skipped] 'title' and 'artist' tags are required to search MusicBrainz " 87 | progprint(ind, total, message=msg) 88 | # print(msg) 89 | return 90 | # The fallback likely won't work but i cba to fix it properly for now 91 | formb_album = str(handle.album) if handle.album is not None else f"{handle.title} (Single)" 92 | 93 | mb = MBSong( 94 | title=str(handle.title), 95 | artist=str(handle.artist), 96 | album=formb_album, 97 | skip_clean_title=True, # this is only useful for youtube songs with messed up titles 98 | debug=debug 99 | ) 100 | mb.fetch_song() 101 | 102 | if debug: 103 | pprint(mb.get_mbid_tags()) 104 | if dry_run: 105 | msg = "[skipped] didn't write due to --dry-run " 106 | # progprint(ind, total, message=msg) 107 | print(msg) 108 | print(mb.get_mb_tags()) 109 | print(json.dumps(mb.get_mbid_tags(), indent=2)) 110 | return 111 | else: 112 | for [k, v] in mb.get_mbid_tags().items(): 113 | setattr(handle, k, v) 114 | handle.save() 115 | ptags = mb.get_mb_tags() 116 | msg = "" 117 | if ptags is not None: 118 | if ptags["artist"] 
is None or ptags["title"] is None or ptags["album"] is None: 119 | pprint(ptags) 120 | msg = f"[ok] written IDs for result: {red_if_none(ptags['artist'])} - {red_if_none(ptags['title'])} (on {red_if_none(ptags['album'])}) " 121 | else: 122 | msg = "[ok] written! " 123 | # print(msg) 124 | progprint(ind, total, message=msg) 125 | 126 | @click.command() 127 | @click.argument("input_path", type=click.Path(exists=True, file_okay=True, resolve_path=True)) 128 | @click.option("--fetch-complete", "-c", is_flag=True, help=f"Fetch from MusicBrainz even if has {", ".join(MBID_TAG_KEYS)} present.") 129 | @click.option("--fetch-partial", "-p", is_flag=True, help="Fetch from MusicBrainz even if has some mb_* tags present.") 130 | @click.option("--dry-run", "-d", is_flag=True, help="Don't write to any files, just print out the mb_* tags") 131 | @click.option("--debug", "-g", is_flag=True, help="Prints out extra information for debugging. Does not imply --dry-run.") 132 | def mbtag_cli(input_path: click.Path, fetch_complete=False, fetch_partial=False, dry_run=False, debug=False): 133 | process_directory(input_path, fetch_complete, fetch_partial, dry_run, debug) 134 | 135 | if __name__ == "__main__": 136 | mbtag_cli() 137 | -------------------------------------------------------------------------------- /shiradl/tagging.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | import os 5 | from io import BytesIO 6 | from pathlib import Path 7 | from statistics import mean, stdev 8 | from typing import NotRequired, TypedDict 9 | 10 | from dateutil import parser 11 | from mediafile import Image as MFImage 12 | from mediafile import ImageType, MediaFile 13 | from PIL import Image, ImageFilter, ImageOps 14 | from requests_cache import CachedSession 15 | 16 | AVG_THRESHOLD = 10 17 | CHANNEL_THRESHOLD = 15 18 | MV_SEPARATOR = "/"#" & " # TODO make this configurable 19 | MV_SEPARATOR_VISUAL = 
" & " 20 | req = CachedSession("shira", expire_after=3600, use_cache_dir=True) 21 | 22 | class Tags(TypedDict): 23 | title: str 24 | album: str 25 | artist: str | list[str] 26 | albumartist: str | list[str] 27 | track: int 28 | tracktotal: int 29 | year: str 30 | date: str 31 | cover_url: str 32 | cover_bytes: NotRequired[bytes] 33 | rating: NotRequired[int] 34 | comments: NotRequired[str] 35 | lyrics: NotRequired[str] 36 | 37 | fallback_mv_keys = ["artist", "albumartist"] 38 | 39 | def metadata_applier(tags: Tags, fixed_location: Path, exclude_tags: list[str], fallback_mv = True): 40 | """set fallback_mv = True until auxio supports proper multi-value m4a tags from mutagen""" 41 | handle = MediaFile(fixed_location) 42 | handle.delete() 43 | # print({**tags, "cover_bytes": ""}) 44 | for k, v in tags.items(): 45 | if k in exclude_tags or k in ["cover_url", "cover_bytes"]: 46 | continue 47 | if k == "date": 48 | v = parser.isoparse(str(v)).date() 49 | if isinstance(v, list): 50 | if not fallback_mv or (k not in fallback_mv_keys): 51 | setattr(handle, f"{k}s", v) # will not work for all single => multi migrations 52 | if k in fallback_mv_keys: 53 | setattr(handle, k, MV_SEPARATOR.join(v) if fallback_mv else MV_SEPARATOR_VISUAL.join(v)) 54 | else: 55 | setattr(handle, k, v) 56 | 57 | if "cover" not in exclude_tags: 58 | cover_bytes = tags.get("cover_bytes") or get_cover(tags["cover_url"]) 59 | handle.images = [ MFImage(data=cover_bytes, desc="Cover", type=ImageType.front) ] 60 | 61 | handle.disc = 1 62 | handle.disctotal = 1 63 | handle.save() 64 | 65 | # cover shenanigans 66 | 67 | @functools.lru_cache 68 | def get_cover(url): 69 | return req.get(url).content 70 | 71 | def get_cover_local(file_path: Path, id_or_url: str, is_soundcloud: bool): 72 | """ 73 | reads a local image as bytes. 
74 | if given a directory, finds the matching image by filename stem matching id_or_url 75 | """ 76 | if file_path.is_file(): 77 | return file_path.read_bytes() 78 | elif file_path.is_dir(): 79 | for filename in os.listdir(file_path): 80 | fp = file_path / filename 81 | if (not fp.is_file()) or (fp.suffix.lower() not in [".jpg", ".jpeg", ".png"]): 82 | continue 83 | if (is_soundcloud and id_or_url.split("/")[-1] == fp.stem) or (is_soundcloud is False and id_or_url == fp.stem): 84 | return fp.read_bytes() 85 | return None 86 | 87 | def get_dominant_color(pil_img: Image.Image) -> tuple[int, int, int, int]: 88 | img = pil_img.copy().convert("RGBA") 89 | img = img.resize((1, 1), resample=Image.Resampling.NEAREST) 90 | 91 | pixel = img.getpixel((0, 0)) 92 | 93 | # Explicitly ensure the return type is always Tuple[int, int, int, int] 94 | if isinstance(pixel, tuple) and len(pixel) == 4: 95 | return pixel 96 | else: 97 | return (0,0,0,255) 98 | 99 | def sample_image_corners(rgb_image, width, height, border_offset = 50): 100 | sample_colors = [] 101 | regions = [ 102 | (border_offset, border_offset), # topleft 103 | (width - border_offset, border_offset), #topright 104 | (border_offset, height - border_offset), #botleft 105 | (width - border_offset, height - border_offset), #botright 106 | # (border_slice_center, height//2), #left center 107 | # (width//2 + height//2 + border_slice_center, height//2) #right center 108 | ] 109 | for sx, sy in regions: 110 | r, g, b = rgb_image.getpixel((sx, sy)) 111 | sample_colors.append((r, g, b)) 112 | return sample_colors 113 | 114 | def determine_image_crop(image_bytes: bytes): 115 | """ 116 | samples 4 pixels near the corners and 2 from centers of side slices of the thumbnail (which is first smoothed and reduced to 64 colors) 117 | 118 | returns 'crop' if average of standard deviation of r, g and b color channels 119 | from each sample point is lower than a than a threshold, otherwise returns 'pad' 120 | """ 121 | pil_img = 
Image.open(BytesIO(image_bytes)) 122 | filt_image = pil_img.filter(ImageFilter.SMOOTH).convert("P", palette=Image.Palette.ADAPTIVE, colors=64) 123 | rgb_filt_image = filt_image.convert("RGB") 124 | 125 | width, height = rgb_filt_image.size 126 | sample_colors50 = sample_image_corners(rgb_filt_image, width, height, 50) 127 | sample_colors0 = sample_image_corners(rgb_filt_image, width, height, 1) 128 | 129 | reds, greens, blues = [], [], [] 130 | for r,g,b in sample_colors50: 131 | reds.append(r) 132 | greens.append(g) 133 | blues.append(b) 134 | 135 | dev_red = stdev(reds) 136 | dev_green = stdev(greens) 137 | dev_blue = stdev(blues) 138 | avg_dev = mean([dev_red, dev_green, dev_blue]) 139 | 140 | # if 4 true corners are 100% equal, fill with that. 141 | # TODO later, crop the borders off of a black-bordered thumbnail for real cropping 142 | fill_recc = sample_colors0[0] if len(set(sample_colors0)) == 1 else None 143 | # print("average:", avg_dev, "colors:", dev_red, dev_green, dev_blue) 144 | 145 | if avg_dev < AVG_THRESHOLD and dev_red < CHANNEL_THRESHOLD and dev_green < CHANNEL_THRESHOLD and dev_blue < CHANNEL_THRESHOLD: 146 | return "crop", fill_recc 147 | else: 148 | return "pad", fill_recc 149 | 150 | def get_1x1_cover(url: str, temp_location: Path, uniqueid: str, cover_format = "JPEG", cover_crop_method = "auto"): 151 | image_bytes = req.get(url).content 152 | pil_img = Image.open(BytesIO(image_bytes)) 153 | 154 | width, height = pil_img.size 155 | aspect_ratio = width / height 156 | 157 | if aspect_ratio == 1: 158 | return image_bytes 159 | 160 | width, height = pil_img.size 161 | recc_fill_color = None 162 | 163 | if cover_crop_method == "auto": 164 | cover_crop_method, recc_fill_color = determine_image_crop(image_bytes) 165 | 166 | if cover_crop_method == "crop": 167 | img_half = round(width / 2) 168 | rect_half = round(height / 2) 169 | pil_img = pil_img.crop((img_half - rect_half, 0, img_half + rect_half, height)) 170 | else: 171 | dominant_color = 
get_dominant_color(pil_img) if recc_fill_color is None else recc_fill_color 172 | pil_img = ImageOps.pad(pil_img, (width, width), color=dominant_color, centering=(0.5, 0.5)) 173 | 174 | output_bytes = BytesIO() 175 | pil_img.save(output_bytes, format=cover_format) 176 | output_bytes.seek(0) 177 | 178 | return output_bytes.read() 179 | -------------------------------------------------------------------------------- /shiradl/metadata.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import random 3 | import re 4 | from collections import Counter 5 | from pathlib import Path 6 | 7 | from requests_cache import CachedSession 8 | 9 | from .tagging import Tags, get_1x1_cover 10 | 11 | TIGER_SINGLE = "tiger:is_single:true" 12 | req = CachedSession("shira", expire_after=60, use_cache_dir=True) 13 | 14 | def parse_datestring(datestr: str): 15 | """parse YYYYMMDD or YYYY-MM-DD into { year: str, month: str, day: str }""" 16 | if re.match(r"^\d{8}$", datestr): 17 | return { "year": datestr[0:4], "month": datestr[4:6], "day": datestr[6:8] } 18 | elif "-" in datestr and re.match(r"^\d{4}-\d{2}-\d{2}$", datestr): 19 | parts = datestr.split("-") 20 | return { "year": parts[0], "month": parts[1], "day": parts[2] } 21 | else: 22 | raise Exception(f"parse_datesting: unknown format of '{datestr}' - use YYYY-MM-DD or YYYYMMDD") 23 | 24 | 25 | def get_year(track: dict[str, str | int], ytmusic_album: dict[str, str | int] | None = None): 26 | """:returns release_year, release_date""" 27 | date = { 28 | "day": "01", 29 | "month": "01", 30 | "year": "" 31 | } 32 | release_date = "" 33 | release_year = "" 34 | 35 | upload_date = track.get("release_date") or track.get("upload_date") 36 | upload_date = str(upload_date) if upload_date is not None else None 37 | 38 | if upload_date: # YYYYMMDD 39 | date = parse_datestring(upload_date) 40 | elif ytmusic_album is not None: 41 | date["year"] = str(ytmusic_album.get("year") or 
track.get("release_year")) 42 | 43 | release_year = date["year"] 44 | release_date = datetime.datetime(day=int(date["day"]), month=int(date["month"]), year=int(date["year"])).isoformat() + "Z" 45 | 46 | return release_year, release_date 47 | 48 | def dash_split(info, title_key: str, obj): 49 | split_title = info[title_key].split(" - ") 50 | 51 | classic_ordering = True 52 | for keyword in ["animatic", "remix"]: 53 | if keyword in info[title_key].lower(): 54 | obj["artist"].append(info["channel"]) 55 | obj["title"].append(split_title[0] if keyword in split_title[1].lower() else split_title[0]) 56 | classic_ordering = False 57 | break 58 | if classic_ordering: 59 | obj["artist"].append(split_title[0]) 60 | obj["title"].append(split_title[1]) 61 | 62 | return obj 63 | 64 | 65 | def smart_tag(list_of_keys: list[str], data_obj: dict, additional_values: list[str]): 66 | """ 67 | counts how many times each value occurs and returns the value that occurs the most 68 | """ 69 | tags = additional_values if additional_values is not None else [] 70 | for item in list_of_keys: 71 | if item in data_obj: 72 | tags.append(data_obj[item]) 73 | 74 | for i, tag in enumerate(tags): 75 | if isinstance(tag, int): 76 | tags[i] = str(tag) 77 | 78 | # filter out none and 'null' 79 | cleaned_tags = list(filter(lambda x: x is not None and x != "null", tags)) 80 | 81 | counts = Counter(cleaned_tags) # count how many times a string occurs in the tags list 82 | counts_entries = list(counts.items()) 83 | sorted_counts = sorted(counts_entries, key = lambda x: x[1]) # sort it (ascending) 84 | dehashed_counts = list(reversed(sorted_counts)) # reverse (descending) 85 | 86 | top_result = dehashed_counts[0][0] 87 | 88 | # resolve conficlics 89 | if len(dehashed_counts) > 1 and dehashed_counts[0][1] == dehashed_counts[1][1]: 90 | second_result = dehashed_counts[1][0] 91 | 92 | # for example if years look like this: [('2017', 1), ({'year': '2017', 'month': '10', 'day': '19'}, 1)] 93 | if 
isinstance(top_result, str) and isinstance(second_result, dict): 94 | top_result, second_result = second_result, top_result 95 | 96 | return top_result, cleaned_tags 97 | 98 | # site extractors 99 | def youtube_extractor(info): 100 | md_keys = { "title": ["title", "track", "alt_title"], "artist": ["artist", "channel", "creator"], "albumartist": [], "album": ["album"] } 101 | add_values = { "title": [], "artist": [], "albumartist": [], "album": [], "year": [], } 102 | 103 | # video title is: Artist - Title format 104 | if info["title"].count(" - ") == 1: 105 | add_values = dash_split(info, "title", add_values) 106 | if info["fulltitle"].count(" - ") == 1: 107 | add_values = dash_split(info, "fulltitle", add_values) 108 | 109 | # channel is: Artist - Topic was superseeded by YTMusic API 110 | 111 | return md_keys, add_values 112 | 113 | def soundcloud_extractor(info): 114 | md_keys = { "title": ["title", "fulltitle"], "artist": ["uploader"], "albumartist": ["uploader"], "album": [] } 115 | add_values = { "title": [], "artist": [], "albumartist": [], "album": [], "year": [], } 116 | 117 | return md_keys, add_values 118 | 119 | def get_youtube_maxres_thumbnail(info): 120 | # sometimes info["thumbnail"] results in the fallback youtube 404 gray thumbnail 121 | pinged_urls = [] 122 | thumbs = list(reversed(info["thumbnails"])) 123 | 124 | def ping_yt(url: str): 125 | res = req.get(str(t["url"])) 126 | pinged_urls.append(t["url"]) 127 | return res 128 | 129 | for t in thumbs: # try to get maxresdefault 130 | if t["url"] in pinged_urls: 131 | continue 132 | if t["url"].endswith("/maxresdefault.jpg") or t["url"].endswith("/maxresdefault.png"): 133 | res = ping_yt(t["url"]) 134 | if res.status_code == 404: 135 | continue 136 | return str(t["url"]) 137 | for t in thumbs: # otherwise, just take the one with the best preference but out format 138 | if t["url"] in pinged_urls: 139 | continue 140 | if t["url"].endswith(".jpg") or t["url"].endswith(".png"): 141 | res = 
ping_yt(t["url"]) 142 | if res.status_code == 404: 143 | continue 144 | return str(t["url"]) 145 | return str(info["thumbnail"]) 146 | 147 | # based on the original https://github.com/KraXen72/tiger 148 | def smart_metadata(info, temp_location: Path, cover_format = "JPEG", cover_crop_method = "auto"): 149 | """ 150 | grabs as much info as it can from all over the place 151 | gets the most likely tag and returns a dict 152 | """ 153 | 154 | thumbnail = get_youtube_maxres_thumbnail(info) 155 | # thumbnail = info["thumbnail"] 156 | md: Tags = { 157 | "title": "", 158 | "artist": "", 159 | "album": "", 160 | "albumartist": "", 161 | "track": 1, 162 | "tracktotal": 1, 163 | "year": "", 164 | "date": "", 165 | "cover_url": thumbnail, 166 | "cover_bytes": get_1x1_cover( 167 | thumbnail, 168 | temp_location, 169 | info.get("id") or clean_title(info.get("title")) or str(random.randint(0, 9) * "16"), 170 | cover_format, 171 | cover_crop_method 172 | ) 173 | } 174 | md_keys = { "title": [], "artist": [], "albumartist": [], "album": [], "year": [], } # keys to check from the 'info object'. site specific. 
175 | add_values = { "title": [], "artist": [], "albumartist": [], "album": [], "year": [], } 176 | others = { "title": [], "artist": [], "albumartist": [], "album": [], "year": [], } 177 | 178 | domain = info["webpage_url_domain"] 179 | match domain: 180 | case "soundcloud.com": 181 | md_keys, add_values = soundcloud_extractor(info) 182 | case _: 183 | if domain != "youtube.com": 184 | print("[warning] unsupported domain:", str(domain), "using youtube extractor as fallback.") 185 | md_keys, add_values = youtube_extractor(info) 186 | 187 | md["title"], others["title"] = smart_tag(md_keys["title"], info, add_values["title"]) 188 | md["artist"], others["artist"] = smart_tag(md_keys["artist"], info, add_values["artist"]) 189 | md["albumartist"], others["albumartist"] = smart_tag(md_keys["albumartist"], info, [md["artist"]] + add_values["albumartist"]) 190 | 191 | md["title"] = clean_title(str(md["title"])) 192 | 193 | # fallback: title (Single) => album, only if there is no album yet 194 | if ("album" not in info) and len(add_values["album"]) == 0: 195 | add_values["album"].append(f"{md['title']} (Single)") 196 | 197 | md["album"], others["album"] = smart_tag(md_keys["album"], info, add_values["album"]) 198 | md["year"], md["date"] = get_year(info) 199 | 200 | if "(Single)" in md["album"]: 201 | md["comments"] = TIGER_SINGLE # TODO remove this later? 
202 | 203 | return md 204 | 205 | bracket_tuples =[["[", "]"], ["【", "】"], ["「", "」"], ["(", ")"]] 206 | title_banned_chars = ["♪"] 207 | 208 | # https://stackoverflow.com/a/49986645/13342359 209 | yeet_emoji = re.compile(pattern = "[" 210 | "\U0001F600-\U0001F64F" # emoticons 211 | "\U0001F300-\U0001F5FF" # symbols & pictographs 212 | "\U0001F680-\U0001F6FF" # transport & map symbols 213 | "\U0001F1E0-\U0001F1FF" # flags (iOS) 214 | "]+", flags = re.UNICODE) 215 | 216 | def clean_title(title: str): 217 | """clean up youtube titles with regex and a lot of black magic""" 218 | 219 | for char in title_banned_chars: 220 | title = title.replace(char, "") 221 | for lb, rb in bracket_tuples: 222 | lbe, rbe = re.escape(lb), re.escape(rb) # check for all matching variations of brackets 223 | for m in re.finditer(rf"{lbe}([^{lbe}{rbe}]+){rbe}", title): 224 | subs = "" # preserve info about a song cover or it's japanese title 225 | if "cover" in m.group(0).lower() or re.match(r"^[一-龠]+|[ぁ-ゔ]+|[ァ-ヴー]+|[々〆〤ヶ]+|\s+$", m.group(1)) is not None: 226 | subs = f"[{m.group(1)}]" 227 | title = title.replace(m.group(0), subs) 228 | 229 | title = re.sub(yeet_emoji, "", title) # remove emoji 230 | title = re.sub(r"\*\b[A-Z ]+\b\*", "", title) # remove stuff like *NOW ON ALL PLATFORMS* 231 | title = re.sub(r"(\S)\[", r"\g<1>" + " [", title, flags=re.MULTILINE) # jap title whitespace fix 232 | title = re.sub(r"\s{2,}", " ", title) # multiple spaces fix 233 | return title.replace("_", "-").strip() 234 | -------------------------------------------------------------------------------- /shiradl/cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import shutil 4 | from http.cookiejar import LoadError as CookieLoadError 5 | from pathlib import Path 6 | 7 | import click 8 | 9 | from . 
logging.basicConfig(
    format="[%(levelname)-8s %(asctime)s] %(message)s",
    datefmt="%H:%M:%S",
)

# CLI parameters that are never persisted to / read from the config file
EXCLUDED_PARAMS = ("urls", "config_location", "url_txt", "no_config_file", "version", "help")


def write_default_config_file(ctx: click.Context):
    """Write a config file populated with every option's default value.

    Creates the parent directory if needed; options in EXCLUDED_PARAMS are
    one-shot/CLI-only and are left out.
    """
    ctx.params["config_location"].parent.mkdir(parents=True, exist_ok=True)
    config_file = {param.name: param.default for param in ctx.command.params if param.name not in EXCLUDED_PARAMS}
    # explicit UTF-8 so the config round-trips on platforms whose locale encoding isn't UTF-8
    with open(ctx.params["config_location"], "w", encoding="utf-8") as f:
        json.dump(config_file, f, indent=4)


def no_config_callback(ctx: click.Context, param: click.Parameter, no_config_file: bool):
    """Click callback for --no-config-file.

    Unless the flag is set, loads the config file (creating it with defaults
    on first run) and fills in every parameter the user did not supply on the
    command line.
    """
    if no_config_file:
        return ctx
    if not ctx.params["config_location"].exists():
        write_default_config_file(ctx)
    with open(ctx.params["config_location"], "r", encoding="utf-8") as f:
        config_file = dict(json.load(f))
    for param in ctx.command.params:
        # precedence: command line > config file > declared default
        if config_file.get(param.name) is not None and ctx.get_parameter_source(param.name) != click.core.ParameterSource.COMMANDLINE:  # type: ignore
            ctx.params[param.name] = param.type_cast_value(ctx, config_file[param.name])  # type: ignore
    return ctx


@click.command()
@click.argument("urls", nargs=-1, type=str, required=True)
@click.option("--final-path", "-f", type=Path, default="./YouTube Music", help="Path where the downloaded files will be saved.")
@click.option("--temp-path", "-t", type=Path, default="./temp", help="Path where the temporary files will be saved.")
@click.option("--cookies-location", "-c", type=Path, default=None, help="Location of the cookies file.")
@click.option("--ffmpeg-location", type=Path, default="ffmpeg", help="Location of the FFmpeg binary.")
@click.option("--config-location", type=Path, default=Path.home() / ".shiradl" / "config.json", help="Location of the config file.")
@click.option("--itag", "-i", type=str, default="140", help="Itag (audio quality).")
@click.option("--cover-size", type=click.IntRange(0, 16383), default=1200, help="Size of the cover.")
@click.option("--cover-format", type=click.Choice(["jpg", "png"]), default="jpg", help="Format of the cover.")
@click.option("--cover-quality", type=click.IntRange(1, 100), default=94, help="JPEG quality of the cover.")
@click.option("--cover-img", type=Path, default=None, help="Path to image or folder of images named video/song id")
@click.option("--cover-crop", type=click.Choice(["auto", "crop", "pad"]), default="auto", help="'crop' takes a 1:1 square from the center, pad always pads top & bottom")
@click.option("--template-folder", type=str, default="{albumartist}/{album}", help="Template of the album folders as a format string.")
@click.option("--template-file", type=str, default="{track:02d} {title}", help="Template of the song files as a format string.")
@click.option("--exclude-tags", "-e", type=str, default=None, help="List of tags to exclude from file tagging separated by commas without spaces.")
@click.option("--truncate", type=int, default=60, help="Maximum length of the file/folder names.")
@click.option("--log-level", "-l", type=click.Choice(["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]), default="INFO", help="Log level.")
@click.option("--save-cover", "-s", is_flag=True, help="Save cover as a separate file.")
@click.option("--overwrite", "-o", is_flag=True, help="Overwrite existing files.")
@click.option("--print-exceptions", "-p", is_flag=True, help="Print exceptions.")
@click.option("--url-txt", "-u", is_flag=True, help="Read URLs as location of text files containing URLs.")
@click.option("--no-config-file", "-n", is_flag=True, callback=no_config_callback, help="Don't use the config file.")
@click.option("--single-folder", "-w", is_flag=True, help="Wrap singles in their own folder instead of placing them directly into artist's folder.")
@click.option("--use-playlist-name", type=bool, is_flag=True, help="Uses the playlist name in the final location when downloading a playlist.")
@click.version_option(__version__)
@click.help_option("-h", "--help")
def cli(
    urls: tuple[str, ...],
    final_path: Path,
    temp_path: Path,
    cookies_location: Path | None,
    ffmpeg_location: Path,
    config_location: Path,
    itag: str,
    cover_size: int,
    cover_format: str,
    cover_quality: int,
    cover_img: Path | None,
    cover_crop: str,
    template_folder: str,
    template_file: str,
    exclude_tags: str | None,
    truncate: int,
    log_level: str,
    save_cover: bool,
    overwrite: bool,
    print_exceptions: bool,
    url_txt: bool,
    no_config_file: bool,
    single_folder: bool,
    use_playlist_name: bool
):
    """Download music from YouTube, YouTube Music and SoundCloud with rich metadata.

    Resolves each URL into a queue of tracks, then for every track: fetches
    tags (YTMusic API, falling back to the Tiger extractor), enriches them
    via MusicBrainz, downloads, remuxes and tags the file, and moves it to
    its templated final location.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(log_level)
    if not shutil.which(str(ffmpeg_location)):
        logger.critical(f'FFmpeg not found at "{ffmpeg_location}"')
        return
    if cookies_location is not None and not cookies_location.exists():
        logger.critical(f'Cookies file not found at "{cookies_location}"')
        return
    if url_txt:
        logger.debug("Reading URLs from text files")
        _urls = []
        for url in urls:
            with open(url, "r", encoding="utf-8") as f:
                _urls.extend(f.read().splitlines())
        urls = tuple(_urls)
    logger.debug("Starting downloader")

    dl = Dl(
        final_path,
        temp_path,
        cookies_location,
        ffmpeg_location,
        itag, cover_size,
        cover_format,
        cover_quality,
        template_folder,
        template_file,
        exclude_tags,
        truncate,
        dump_json=log_level == "DEBUG",
        use_playlist_name=use_playlist_name
    )
    download_queue = []
    for i, url in enumerate(urls):
        try:
            logger.debug(f'Checking "{url}" (URL {i + 1}/{len(urls)})')
            download_queue.append(dl.get_download_queue(url))
        except CookieLoadError as he:  # handled exceptions
            logger.error(he, exc_info=False)
        except Exception:
            # traceback is printed only when the user asked for it via -p
            logger.error(f"Failed to check URL {i + 1}/{len(urls)}", exc_info=print_exceptions)
    error_count = 0
    for i, url in enumerate(download_queue):
        for j, track in enumerate(url):
            logger.info(f'Downloading "{track["title"]}" (track {j + 1}/{len(url)} from URL {i + 1}/{len(download_queue)})')
            try:
                logger.debug("Getting tags")
                ytmusic_watch_playlist = dl.get_ytmusic_watch_playlist(track["id"])

                dl.tags = None
                tags = None
                is_single = False
                if ytmusic_watch_playlist is None:
                    logger.info("No results on YTMusic API, using Tigerv2 to extract metadata")
                    tag_track = track
                    if "webpage_url_domain" not in track:
                        tag_track = dl.get_ydl_extract_info(track["url"])
                    logger.debug("Starting Tigerv2")
                    tags = smart_metadata(tag_track, temp_path, "JPEG" if dl.cover_format == "jpg" else "PNG", cover_crop)
                    is_single = tags.get("comments") == TIGER_SINGLE
                    if is_single:
                        # replace the sentinel comment with the track's actual URL
                        tags["comments"] = str(track.get("webpage_url") or track.get("original_url") or track.get("url") or url)
                else:
                    tags = dl.get_tags(ytmusic_watch_playlist, track)
                    is_single = tags["tracktotal"] == 1
                logger.debug("Tags applied, fetching MusicBrainz Database")
                tags = musicbrainz_enrich_tags(tags, dl.soundcloud, dl.exclude_tags)
                logger.debug("Applied MusicBrainz Tags")
                if cover_img:
                    local_img_bytes = get_cover_local(cover_img, track["url"] if dl.soundcloud else track["id"], dl.soundcloud)
                    if local_img_bytes is not None:
                        tags["cover_bytes"] = local_img_bytes
                        logger.debug("Applied cover Image")
                final_location = dl.get_final_location(tags, ".mp3" if dl.soundcloud is True else ".m4a", is_single, single_folder)
                logger.debug(f'Final location is "{final_location}"')
                temp_location = dl.get_temp_location(track["id"])
                if not final_location.exists() or overwrite:
                    logger.debug(f'Downloading to "{temp_location}"')
                    if dl.soundcloud is False:
                        dl.download(track["id"], temp_location)
                    else:
                        dl.download_souncloud(track.get("original_url") or track["webpage_url"], temp_location)

                    fixed_location = dl.get_fixed_location(track["id"])
                    logger.debug(f'Remuxing to "{fixed_location}"')
                    dl.fixup(temp_location, fixed_location)
                    logger.debug("Applying tags")
                    metadata_applier(tags, fixed_location, dl.exclude_tags)
                    logger.debug("Moving to final location")
                    dl.move_to_final_location(fixed_location, final_location)
                else:
                    logger.warning("File already exists at final location, skipping")
                if save_cover:
                    cover_location = dl.get_cover_location(final_location)
                    if not cover_location.exists() or overwrite:
                        logger.debug(f'Saving cover to "{cover_location}"')
                        dl.save_cover(tags, cover_location)
                    else:
                        logger.debug(f'File already exists at "{cover_location}", skipping')
            except Exception:
                error_count += 1
                logger.error(
                    f'Failed to download "{track["title"]}" (track {j + 1}/{len(url)} from URL ' + f"{i + 1}/{len(download_queue)})",
                    exc_info=print_exceptions,
                )
            finally:
                if temp_path.exists():
                    logger.debug(f'Cleaning up "{temp_path}"')
                    dl.cleanup()
    logger.info(f"Done ({error_count} error(s))")
class Dl:
    """Core downloader: resolves URLs with yt-dlp, fetches metadata via
    ytmusicapi, and handles download, ffmpeg remux and final file placement."""

    def __init__(
        self,
        final_path: Path,
        temp_path: Path,
        cookies_location: Path,
        ffmpeg_location: Path,
        itag: str,
        cover_size: int,
        cover_format: str,
        cover_quality: int,
        template_folder: str,
        template_file: str,
        exclude_tags: str | None,
        truncate: int,
        dump_json: bool = False,
        use_playlist_name: bool = False,
        **kwargs,
    ):
        # **kwargs absorbs any extra keyword options callers pass through

        self.ytmusic = YTMusic()
        self.final_path = final_path
        self.temp_path = temp_path
        self.cookies_location = cookies_location
        self.ffmpeg_location = ffmpeg_location
        self.itag = itag
        self.cover_size = cover_size
        self.cover_format = cover_format
        self.cover_quality = cover_quality
        self.template_folder = template_folder
        self.template_file = template_file
        # "a,B" -> ["a", "b"]; empty list when no tags are excluded
        self.exclude_tags = [i.lower() for i in exclude_tags.split(",")] if exclude_tags is not None else []
        # truncating to fewer than 4 chars would eat into file extensions -> disable truncation
        self.truncate = None if truncate is not None and truncate < 4 else truncate

        self.dump_json = dump_json
        self.tags: Tags | None = None  # per-track tag cache, reset by the caller between tracks
        self.soundcloud = False  # flipped to True once a SoundCloud URL is seen
        self.default_ydl_opts = {"progress": True, "quiet": True, "no_warnings": True, "fixup": "never"}
        self.use_playlist_name = use_playlist_name

    def get_ydl_extract_info(self, url) -> dict:
        """Run a flat (no-download) yt-dlp extraction for *url* and return the info dict.

        Raises if yt-dlp returns nothing.
        """
        ydl_opts: dict[str, str | bool] = {"quiet": True, "no_warnings": True, "extract_flat": True}
        if self.cookies_location is not None:
            ydl_opts["cookiefile"] = str(self.cookies_location)
        with YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
            if info is None:
                raise Exception(f"Failed to extract info for {url}")
            return info

    def get_download_queue(self, url):
        """Resolve *url* (track, album or playlist) into a list of track info dicts.

        Side effects: may switch the instance into SoundCloud mode and may
        re-root final_path (SoundCloud default folder, playlist name).
        """
        url = url.split("&")[0]  # drop extra query params (e.g. &list=... on watch URLs)
        download_queue = []
        ydl_extract_info: dict = self.get_ydl_extract_info(url)

        if self.dump_json:
            # audio_formats = [ x for x in ydl_extract_info["formats"] if "acodec" in x and x["acodec"] != "none" ]
            # audio_formats = sorted(audio_formats, key = lambda x: x["quality"], reverse=True)

            f = open("info.json", "w", encoding="utf8")
            json.dump(ydl_extract_info, f, indent=4, ensure_ascii=False)
            f.close()

        if "soundcloud" in ydl_extract_info["webpage_url"]:
            # f = open("info.json", "w", encoding="utf8")
            # json.dump(ydl_extract_info, f, indent=4, ensure_ascii=False)
            # f.close()

            # raise Exception("Not a YouTube URL")
            # only replace the built-in default; keep a user-chosen final path
            if str(self.final_path) == "./YouTube Music":
                self.final_path = Path("./SoundCloud")
            self.soundcloud = True
        if "MPREb_" in ydl_extract_info["webpage_url_basename"]:
            # album browse page -> follow through to the underlying playlist
            ydl_extract_info = self.get_ydl_extract_info(ydl_extract_info["url"])
        if "playlist" in ydl_extract_info["webpage_url_basename"]:
            if self.use_playlist_name:
                playlist_name = ydl_extract_info.get("title", "Unknown Playlist")
                self.final_path = self.final_path / self.get_sanizated_string(playlist_name, True)
            download_queue.extend(ydl_extract_info["entries"])
        if "watch" in ydl_extract_info["webpage_url_basename"] or self.soundcloud:
            download_queue.append(ydl_extract_info)
        return download_queue

    def get_artist(self, artist_list):
        """Join artist names: "A", "A & B", "A, B & C"."""
        if len(artist_list) == 1:
            return artist_list[0]["name"]
        return ", ".join([i["name"] for i in artist_list][:-1]) + f' & {artist_list[-1]["name"]}'

    def get_ytmusic_watch_playlist(self, video_id):
        """Fetch the YTMusic watch playlist for *video_id*.

        Returns None for SoundCloud tracks and for videos without album info
        (the caller then falls back to the Tiger extractor); raises when the
        track is unavailable.
        """
        if self.soundcloud:
            return None
        ytmusic_watch_playlist = self.ytmusic.get_watch_playlist(video_id)
        if ytmusic_watch_playlist is None or isinstance(ytmusic_watch_playlist, str):
            raise Exception(f"Track is not available (None or string) {video_id}")

        if not ytmusic_watch_playlist["tracks"][0]["length"] and ytmusic_watch_playlist["tracks"][0].get("album"):  # type: ignore
            raise Exception(f"Track is not available {video_id}")
        if not ytmusic_watch_playlist["tracks"][0].get("album"):  # type: ignore
            return None
        return ytmusic_watch_playlist

    def search_track(self, title):
        """Return the videoId of the first YTMusic song result for *title*."""
        return self.ytmusic.search(title, "songs")[0]["videoId"]

    @functools.lru_cache
    def get_ytmusic_album(self, browse_id):
        # NOTE(review): lru_cache on an instance method keeps `self` alive for the
        # cache's lifetime (ruff B019) — consider a per-instance cache instead
        return self.ytmusic.get_album(browse_id)

    def get_tags(self, ytmusic_watch_playlist, track: dict[str, str | int]) -> Tags:
        """Return tags for the current track, collecting them on first call."""
        if self.tags is None:
            return self.__collect_tags(ytmusic_watch_playlist, track)
        else:
            return self.tags

    def __collect_tags(self, ytmusic_watch_playlist, track: dict[str, str | int]):
        """collects tag information into self.tags"""
        if self.tags is not None:
            return self.tags

        video_id = ytmusic_watch_playlist["tracks"][0]["videoId"]
        ytmusic_album: dict = self.ytmusic.get_album(ytmusic_watch_playlist["tracks"][0]["album"]["id"])
        _year, _date = get_year(track, ytmusic_album)
        tags: Tags = {
            "title": clean_title(ytmusic_watch_playlist["tracks"][0]["title"]),
            "album": ytmusic_album["title"],
            "albumartist": self.get_artist(ytmusic_album["artists"]),
            "artist": self.get_artist(ytmusic_watch_playlist["tracks"][0]["artists"]),
            "comments": f"https://music.youtube.com/watch?v={video_id}",
            "track": 1,
            "tracktotal": ytmusic_album["trackCount"],
            "date": _date,
            "year": _year,
            # base thumbnail URL + size/quality/format modifiers ("rj" = jpg, "rp" = png)
            "cover_url": f'{ytmusic_watch_playlist["tracks"][0]["thumbnail"][0]["url"].split("=")[0]}'
            + f'=w{self.cover_size}-l{self.cover_quality}-{"rj" if self.cover_format == "jpg" else "rp"}'
        }

        # find this video's position within the album playlist -> track number
        for i, video in enumerate(self.get_ydl_extract_info(f'https://www.youtube.com/playlist?list={str(ytmusic_album["audioPlaylistId"])}')["entries"]):
            if video["id"] == video_id:
                tags["track"] = i + 1
                break
        if ytmusic_watch_playlist["lyrics"]:
            lyrics_data = self.ytmusic.get_lyrics(ytmusic_watch_playlist["lyrics"])
            if lyrics_data is not None and "lyrics" in lyrics_data:
                tags["lyrics"] = lyrics_data["lyrics"]

        self.tags = tags
        return self.tags

    def get_sanizated_string(self, dirty_string, is_folder):
        """Make *dirty_string* filesystem-safe and truncate it to self.truncate."""
        dirty_string = re.sub(r'[\\/:*?"<>|;]', "_", dirty_string)
        if is_folder:
            dirty_string = dirty_string[: self.truncate]
            # folder names ending with "." are invalid on Windows
            if dirty_string.endswith("."):
                dirty_string = dirty_string[:-1] + "_"
        else:
            if self.truncate is not None:
                # leave room for the 4-char extension (e.g. ".m4a")
                dirty_string = dirty_string[: self.truncate - 4]
        return dirty_string.strip()

    def get_temp_location(self, song_id):
        """Path of the raw download inside the temp folder."""
        if self.soundcloud:
            return self.temp_path / f"{song_id}.mp3"
        return self.temp_path / f"{song_id}.m4a"

    def get_fixed_location(self, song_id):
        """Path of the remuxed ("fixed") file inside the temp folder."""
        if self.soundcloud:
            return self.temp_path / f"{song_id}_fixed.mp3"
        return self.temp_path / f"{song_id}_fixed.m4a"

    def get_final_location(self, tags, extension = ".m4a", is_single = False, single_folders = False):
        """Build the final output path from the folder/file templates and *tags*."""
        final_location_folder = self.template_folder.split("/")
        final_location_file = self.template_file.split("/")

        # singles: drop the trailing "/{album}" folder level and the "NN " track prefix
        if is_single and not single_folders and self.template_folder.endswith("/{album}"):
            folder = self.template_folder[:-8]
            if len(folder.strip()) == 0:
                folder = "./"
            final_location_folder = folder.split("/")
            if (self.template_file.startswith("{track:02d} ")):
                locfile = self.template_file[12:]
                final_location_file = "{title}".split("/") if locfile.strip() == "" else locfile.split("/")

        filename_safe_tags: dict[str, str] = {}
        for k, v in tags.items():  # join artists with & so filenames aren't like ['Artist1', 'Artist2'] but rather Artist1 & Artist2
            if isinstance(v, list):
                filename_safe_tags[k] = MV_SEPARATOR_VISUAL.join([ vv if isinstance(vv, str) else vv.decode("utf-8") for vv in v ])
            else:
                filename_safe_tags[k] = v

        # pprint(filename_safe_tags)

        final_location_folder = [self.get_sanizated_string(i.format(**filename_safe_tags), True) for i in final_location_folder]
        final_location_file = [self.get_sanizated_string(i.format(**filename_safe_tags), True) for i in final_location_file[:-1]] + [
            self.get_sanizated_string(final_location_file[-1].format(**filename_safe_tags), False) + extension
        ]
        return self.final_path.joinpath(*final_location_folder).joinpath(*final_location_file)

    def get_cover_location(self, final_location):
        """Cover image path, next to the final audio file."""
        return final_location.parent / f"Cover.{self.cover_format}"

    def download(self, video_id, temp_location):
        """Download a YouTube (Music) track by *video_id* to *temp_location*."""
        ydl_opts = {**self.default_ydl_opts, "format": self.itag, "outtmpl": str(temp_location)}

        if self.cookies_location is not None:
            ydl_opts["cookiefile"] = str(self.cookies_location)
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download("music.youtube.com/watch?v=" + video_id)

    def download_souncloud(self, url, temp_location):
        """Download a SoundCloud track (as mp3) to *temp_location*."""
        # opus is obviously a better format, however:
        # it's debatable whether soundcloud's mp3 is better than their opus
        # because they might just use lower quality audio for opus (there have been complaints)
        # this can be possibly later changed, for now we'll stick to mp3
        ydl_opts = {**self.default_ydl_opts, "format": "mp3", "outtmpl": str(temp_location)}

        if self.cookies_location is not None:
            ydl_opts["cookiefile"] = str(self.cookies_location)
        with YoutubeDL(ydl_opts) as ydl:
            ydl.download(url)

    def fixup(self, temp_location, fixed_location):
        """Remux the download with ffmpeg (stream copy, +faststart) into *fixed_location*."""
        fixup = [self.ffmpeg_location, "-loglevel", "error", "-i", temp_location]
        codec = self.get_audio_codec(temp_location)
        if codec == "opus":
            # opus needs the mp4 container forced explicitly
            fixup.extend(["-f", "mp4"])
        subprocess.run([*fixup, "-movflags", "+faststart", "-c", "copy", fixed_location], check=True)

    def move_to_final_location(self, fixed_location, final_location):
        """Move the finished file into place, creating parent folders as needed."""
        final_location.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(fixed_location, final_location)

    def save_cover(self, tags, cover_location):
        """Download the cover from tags["cover_url"] and write it to *cover_location*."""
        with open(cover_location, "wb") as f:
            f.write(get_cover(tags["cover_url"]))

    def cleanup(self):
        """Delete the temp folder and everything in it."""
        shutil.rmtree(self.temp_path)

    def get_audio_codec(self, file_path):
        """Use ffprobe to extract the audio codec of the given file."""
        # TODO make sure ffprobe is in path as well? otherwise just allow pre-determined codecs like before this MR
        cmd = [
            "ffprobe",
            "-v", "error",
            "-select_streams", "a:0",
            "-show_entries", "stream=codec_name",
            "-of", "json",
            str(file_path)
        ]
        # Run ffprobe and parse output
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        codec_info = json.loads(result.stdout)
        # Extract and return codec name
        return codec_info["streams"][0]["codec_name"]

shira

2 |

3 |

A smart music downloader

4 |

5 | Download music from YouTube, YouTube Music and Soundcloud,
with great metadata and little effort.
6 |

7 | 8 | ## Installation 9 | - Have [python](https://www.python.org/downloads/) (**3.11+**) and [pipx](https://pipx.pypa.io/stable/installation/#installing-pipx) installed 10 | - Have `ffmpeg` installed (See [Installing ffmpeg](#installing-ffmpeg)) and added to PATH, or [specify it with `--ffmpeg-location`](#configuration)/[config](#configuration) 11 | - `pipx install git+https://github.com/KraXen72/shira` 12 | 13 | **Guides**: [Using a cookies file](#setting-a-cookies-file), --> [Troubleshooting](#troubleshooting) <-- 14 | 15 | ## Usage Examples 16 | - `shiradl https://music.youtube.com/watch?v=HdX2COsY2Xk` **YouTube Music** 17 | - `shiradl "https://music.youtube.com/watch?v=8YwKlPH93Ps&list=PLC1og_v3eb4jE0bmdkWtizrSQ4zt86-3D"` 18 | - `shiradl https://www.youtube.com/watch?v=X0-AvRA7kB0` **YouTube (video)** 19 | - `shiradl https://soundcloud.com/neffexmusic/fight-back` **SoundCloud** 20 | - `shiradl https://music.youtube.com/playlist?list=PLC1og_v3eb4jE0bmdkWtizrSQ4zt86-3D` **Album/Playlist** 21 | - `shiradl -u ./links.txt` **List of links to download** 22 | - [See all cli options/flags](#Configuration) 23 | 24 | ## Goals 25 | - Provide an easy way to download audio from YouTube Music, YouTube or SoundCloud 26 | - Instead of a GUI/manual input for some steps like in [tiger](https://github.com/KraXen72/tiger), shira requires no additional user input once ran. 27 | - Provide objectively correct or at least very reasonable music metadata & properly tag music files. 28 | - objectively correct: Shira queries the [MusicBrainz Database](https://musicbrainz.org) and [YouTube Music's API](https://github.com/sigma67/ytmusicapi) to get metadata 29 | - very reasonable: When downloading a Youtube video, tags will be inferred from the video info: `title`, `channel_name`, `description`, `upload_date`, etc. 
30 | 31 | ## Tagging 32 | - Adds a [lot of metadata](#tag-variables) to music files, in these [native tags](https://github.com/OxygenCobalt/Auxio/wiki/Supported-Metadata) (m4a, mp3) 33 | - Embeds proper `m4a` (iTunes) and `.mp3` (ID3v2.4) tags with [mediafile](https://github.com/beetbox/mediafile) 34 | - Uses [YouTube Music's API](https://github.com/sigma67/ytmusicapi) to get info. 35 | - Uses [MusicBrainz API](https://musicbrainz.org/doc/MusicBrainz_API) to resolve MusicBrainz ID's from their api 36 | - `track`, `album`, `artist`, `albumartist` ids 37 | - falls back to `artist`, `albumartist` if this recording can't be found, but artist can. 38 | - uses my custom smart-metadata system from [tiger](https://github.com/KraXen72/tiger) for non-music videos 39 | - collects as much information as possible for each tag, and selects the value with most occurences (with fallbacks) 40 | - Cleans up messy titles into more reasonable ones: 41 | - `IDOL【ENGLISH EDM COVER】「アイドル」 by ARTIST【Artist1 x @Artist2 】` => 42 | - `IDOL [ENGLISH EDM COVER] [アイドル] by ARTIST` 43 | - Is smart about turning a video's thumbnail into a square album cover 44 | 45 |
46 | More info about YouTube thumbnail to Album Art algorithm 47 |
    48 |
  1. samples 4 pixels near the corners of the thumbnail (which is first smoothed and reduced to 64 colors)
  2. 49 |
3. decides to crop if average of standard deviations of r, g and b color channels from each sample point is lower than a threshold
  4. 50 |
  5. otherwise pads the image to 1:1 with it's dominant color
  6. 51 |
52 |
53 | 54 | ## About & Credits 55 | - **This software is for educational purposes only and comes without any warranty**; See [LICENSE](./LICENSE). 56 | - Credits for copyright-free example tracks used: [Andy Leech](https://soundcloud.com/andyleechmusic), [4lienetic](https://soundcloud.com/4lienetic), [NEFFEX](https://soundcloud.com/neffexmusic) 57 | - The name **Shira** was inspired by a saber-toothed [tiger](https://github.com/KraXen72/tiger) from [Ice Age](https://iceage.fandom.com/wiki/Shira). 58 | - It also means ['poetry', 'singing' or 'music'](https://www.wikiwand.com/en/Shira_(given_name)) in Hebrew. 59 | - The project is based on my previous [YouTube downloader tiger](https://github.com/KraXen72/tiger) and [Glomatico's YouTube Music Downloader](https://github.com/glomatico/gytmdl) 60 | - Project logo is based on this [DeviantArt fanart](https://www.deviantart.com/f-a-e-l-e-s/art/Ice-age-5-Shira-and-Diego-757174602), which has been modified, vectorised and cleaned up. 61 | 62 | ### Support development 63 | [![Recurring donation via Liberapay](https://liberapay.com/assets/widgets/donate.svg)](https://liberapay.com/KraXen72) [![One-time donation via ko-fi.com](https://ko-fi.com/img/githubbutton_sm.svg)](https://ko-fi.com/kraxen72) 64 | Any donations are highly appreciated! <3 65 | 66 | ## Configuration 67 | Shira can be configured using the command line arguments or the config file. 68 | The config file is created automatically when you run shira for the first time at `~/.shiradl/config.json` on Linux and `%USERPROFILE%\.shiradl\config.json` on Windows. Config file values can be overridden using command line arguments. 69 | 70 | | Command line argument / Config file key | Description | Default value | 71 | | --- | --- | --- | 72 | | `-f`, `--final-path` / `final_path` | Path where the downloaded files will be saved. | `./YouTube Music` | 73 | | `-t`, `--temp-path` / `temp_path` | Path where the temporary files will be saved. 
| `./temp` | 74 | | `-c`, `--cookies-location` / `cookies_location` | Location of the cookies file. | `null` | 75 | | `--ffmpeg-location` / `ffmpeg_location` | Location of the FFmpeg binary. | `ffmpeg` | 76 | | `--config-location` / - | Location of the config file. | `/.shiradl/config.json` | 77 | | `-i`, `--itag` / `itag` | Itag (audio quality/format). [More info](#itags) | `140` | 78 | | `--cover-size` / `cover_size` | Size of the cover. `size >= 0` and `<= 16383` | `1200` | 79 | | `--cover-format` / `cover_format` | Format of the cover. `jpg` or `png` | `jpg` | 80 | | `--cover-quality` / `cover_quality` | JPEG quality of the cover. [1<=x<=100] | `94` | 81 | | `--cover-img` / `cover_img` | Path to image or folder of images. [More info](#cover-img) | `null` | 82 | | `--cover-crop` / `cover_crop` | 'crop' takes a 1:1 square from the center, pad always pads top & bottom. `auto`, `crop` or `pad` | `auto` - [More info](#smartcrop) | 83 | | `--template-folder` / `template_folder` | Template of the album folders as a format string. | `{albumartist}/{album}` | 84 | | `--template-file` / `template_file` | Template of the track files as a format string. | `{track:02d} {title}` | 85 | | `-e`, `--exclude-tags` / `exclude_tags` | List of tags to exclude from file tagging separated by commas without spaces. | `null` | 86 | | `--truncate` / `truncate` | Maximum length of the file/folder names. | `40` | 87 | | `-l`, `--log-level` / `log_level` | Log level. | `INFO` | 88 | | `-s`, `--save-cover` / `save_cover` | Save cover as a separate file. | `false` | 89 | | `-o`, `--overwrite` / `overwrite` | Overwrite existing files. | `false` | 90 | | `-p`, `--print-exceptions` / `print_exceptions` | Print exceptions. | `false` | 91 | | `-u`, `--url-txt` / - | Read URLs as location of text files containing URLs. | `false` | 92 | | `-n`, `--no-config-file` / - | Don't use the config file. 
| `false` | 93 | | `-w`, `--single-folder` / - | Wrap singles in their own folder instead of placing them directly into artist's folder. | `false` | 94 | 95 | ### Itags 96 | The following itags are available: 97 | - `140` (128kbps AAC) - default, because it's the result of `bestaudio/best` on a free account 98 | - `141` (256kbps AAC) - use if you have premium alongside `--cookies-location` 99 | - `251` (128kbps Opus) - most stuff will error with `Failed to check URL 1/1`. Better to use `140` 100 | 101 | SoundCloud will always download in 128kbps MP3 102 | - SoundCloud also offers OPUS, which is currently not supported. [Some people were complaining](https://www.factmag.com/2018/01/04/soundcloud-mp3-opus-format-sound-quality-change-64-128-kbps/) that the quality is worse 103 | - [These are questionable claims](https://old.reddit.com/r/Techno/comments/bzodax/soundcloud_compression_128kbps_mp3_vs_64_kbps/) at best, but better safe than sorry. 104 | 105 | ### Tag variables 106 | The following variables can be used in the template folder/file and/or in the `exclude_tags` list: 107 | `title`, `album`, `artist`, `albumartist`, `track`, `tracktotal`, `year`, `date`, `cover`, `comments`, `lyrics`, `media_type`, `rating`, `track`, `tracktotal`, `mb_releasetrackid`, `mb_releasegroupid`, `mb_artistid`, `mb_albumartistid` 108 | To exclude all musicbrainz tags, you can add `mb*` to `exclude_tags`. (This does not work for other types of tags). 109 | 110 | ### Cover formats 111 | Can be either `jpg` or `png`. 112 | 113 | ### Cover img 114 | - Pass in a path to an image file, and it will get used for all of the links you're currently downloading. 115 | - Pass in a path to a folder, and the script will use the first image matching the track/video id and jpeg/png format 116 | - You don't have to create covers for all tracks/videos in the playlist/album/etc. 
117 | - SoundCloud will also consider images based on the URL slug instead of id 118 | - *for example*: `https://soundcloud.com/yatashi-gang-63564467/lovely-bastards-yatashigang` => `lovely-bastards-yatashigang.jpg` / `.png` 119 | 120 | ## Troubleshooting 121 | - run `pipx upgrade shiradl --pip-args='--upgrade-strategy=eager'`, as it's likely yt-dlp or something else needs updating 122 | - In case shira can't download songs / you're having other issues: 123 | - as a temporary measure, you can [try these steps](https://github.com/KraXen72/shira/issues/19#issuecomment-2661907637) 124 | - `python: No module named shiradl` 125 | - Make sure you are not already in the `shiradl` directory, e.g. `/shira/shiradl`. if yes, move up one directory with `cd ..` and retry. 126 | - I really need to run this on `python` 3.8+ and updating to 3.11+ is not an option 127 | - run `pip install typing-extensions` and modify `tagging.py` accordingly: 128 | ```diff 129 | - from typing import NotRequired, TypedDict 130 | + from typing_extensions import NotRequired, TypedDict 131 | ``` 132 | 133 | ### Installing ffmpeg 134 | #### Installing ffmpeg with scoop 135 | - Scoop is a package manager for windows. It allows easy installing of programs and their updating from the commandline. 136 | - Install [scoop](https://scoop.sh) by running a powershell command (on their website) 137 | - Run `scoop install main/ffmpeg` 138 | - Scoop automatically adds it to path. you can update ffmpeg by doing `scoop update` and `scoop update ffmpeg`/`*` 139 | - If installing scoop/with scoop is not an option, continue reading: 140 | #### Installing ffmpeg on Windows (manual install) 141 | - Related: [Comprehensive tutorial with screenshots](https://phoenixnap.com/kb/ffmpeg-windows) 142 | - Download an auto-built zip of latest ffmpeg: [download](https://www.gyan.dev/ffmpeg/builds/) / [mirror](https://github.com/BtbN/FFmpeg-Builds/releases). 143 | - Extract it somewhere, for example into `C:\ffmpeg`. 
It's best if the path doesn't have spaces.
-------------------------------------------------------------------------------- /shiradl/musicbrainz.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from typing import TypedDict 4 | 5 | from requests_cache import CachedSession 6 | 7 | from .__init__ import __version__ as shiraver 8 | from .metadata import clean_title, parse_datestring 9 | from .tagging import Tags 10 | 11 | # it's better if this is a "submodule" of shira (a part of it) 12 | # works on it's own (name == __main__), but everything apart from the musibrainz logic doesen't live in it 13 | # it's in a separate python module is to have a separate command & to separate the code 14 | 15 | # at some point, i might have to just switch this to depend on picard itself or it's submodule - i can only get so far with lookup 16 | # acoutsid fingerprinting might be a good idea 17 | # however, even then, it's not 100% accurate... 18 | 19 | MBArtist = TypedDict("MBArtist", { 20 | "id": str, 21 | "name": str, 22 | "sort-name": str, 23 | }) 24 | 25 | MBArtistCredit = TypedDict("MBArtistCredit", { 26 | "name": str, 27 | "sort-name": str, 28 | "artist": MBArtist 29 | }) 30 | 31 | MBRelease = TypedDict("MBRelease", { 32 | "id": str, 33 | "title": str, 34 | "artist-credit": list[MBArtistCredit], 35 | "release-group": dict[str, str], 36 | "date": str 37 | }) 38 | 39 | MBRecording = TypedDict("MBRecording", { 40 | "id": str, 41 | "title": str, 42 | "artist-credit": list[MBArtistCredit], 43 | "releases": list[MBRelease] 44 | }) 45 | 46 | leading_zero_re = r"(?<=\b)0+(?=[1-9])" # strips all leading zeros 47 | 48 | # MusicBrainz usually has songs with feat. in the title without it 49 | # look i'm not stoked about this regex but it works 50 | title_feat_re = r"\s?(?:(ft\. 
\b.+\b)|(\(feat\.?.+\)))" 51 | 52 | yeet_regexes = [ 53 | leading_zero_re, 54 | title_feat_re, 55 | r"," 56 | ] 57 | 58 | hyphens_re = r"‐|‑|‒|–|—|―|⁃|-" # non-standard hyphens 59 | 60 | def normalized_compare_regex(in1: str, in2: str, strict = True, debug = False): 61 | """ 62 | compares 2 strings after normalization 63 | - e.g. 2:09 matches 02:09 64 | - e.g. Sci-Fi matches Sci—Fi 65 | :param strict: if off, it will check if in1 is a substring of in2 rather than direct comparison 66 | """ 67 | expr = [in1.lower().strip(), in2.lower().strip()] 68 | for i in range(len(expr)): 69 | for yeet_re in yeet_regexes: 70 | expr[i] = re.sub(yeet_re, "", expr[i]) 71 | expr[i] = expr[i].replace("/", "/") 72 | expr[i] = re.sub(hyphens_re, "-", expr[i]) 73 | expr[i] = expr[i].strip() 74 | 75 | if debug: 76 | print(f"e1: {in1} e2: {in2}, strict:{strict}") 77 | print(f"out: e1: {expr[0]} e2: {expr[1]}") 78 | 79 | return expr[0] == expr[1] if strict else expr[0] in expr[1] 80 | 81 | def check_bareartist_match(artist: str, a_dict: MBArtist): 82 | """fuzzy song artist (single/bare) matching""" 83 | return artist == a_dict["name"] or artist.lower() == a_dict["name"].lower() \ 84 | or artist == a_dict["sort-name"] or artist.lower() == a_dict["sort-name"].lower() 85 | 86 | def check_artist_match(artist: str, acred_list: list[MBArtistCredit]): 87 | """fuzzy song artist matching (matches serveral artists as well)""" 88 | if len(acred_list) > 1: 89 | # not using ARTIST_SEPARATOR here because ytmusic joins artists by & 90 | joinphrase = str(acred_list[0].get("joinphrase")).strip() or "&" 91 | yt_artists = [a.strip() for a in artist.split(joinphrase)] 92 | 93 | all_artists_match = True 94 | for yta in yt_artists: 95 | found_match = False 96 | for acred in acred_list: 97 | if check_bareartist_match(yta, acred["artist"]): 98 | found_match = True 99 | break 100 | if not found_match: 101 | all_artists_match = False 102 | break 103 | 104 | return all_artists_match 105 | else: 106 | return 
check_bareartist_match(artist, acred_list[0]["artist"]) 107 | 108 | def check_barealbum_match(album: str, r_dict: MBRelease): 109 | """semi-strict album match checker""" 110 | return album == r_dict["title"] or album.replace("(Single)", "").strip() == r_dict["title"] \ 111 | or album.lower() == r_dict["title"].lower() or album.replace("(Single)", "").strip().lower() == r_dict["title"].lower() \ 112 | or normalized_compare_regex(album, r_dict["title"]) 113 | 114 | def check_barealbum_match2(album: str, r_dict: MBRelease): 115 | """looser check_barealbum_match if title_match and artist_match are both true """ 116 | return album in r_dict["title"] or album.replace("(Single)", "").strip() in r_dict["title"] \ 117 | or album.lower() in r_dict["title"].lower() or album.replace("(Single)", "").strip().lower() in r_dict["title"].lower() \ 118 | or normalized_compare_regex(album, r_dict["title"], strict=False) 119 | 120 | def check_album_match(album: str, r_dict: MBRelease, title_match: bool, artist_match: bool): 121 | """fuzzy song album matching""" 122 | if title_match and artist_match: 123 | # exception: if title & artist match, allow mbid album to be a superset (contain) album needle 124 | # e.g. 
def check_title_match(title: str, r_dict: "MBRecording", debug = False):
	"""fuzzy song title matching: exact, case-insensitive, then normalized comparison"""
	return title == r_dict["title"] or title.lower() == r_dict["title"].lower() \
		or normalized_compare_regex(title, r_dict["title"], debug=debug)

def get_mb_artistids(a_list: "list[MBArtistCredit]", return_single = False):
	"""get artist mbid (single credit, or forced via return_single) or list of mbids"""
	if len(a_list) == 1 or return_single:
		return a_list[0]["artist"]["id"]
	else:
		return [a["artist"]["id"] for a in a_list]


class MBSong:
	"""MusicBrainz song item"""
	def __init__(
		self,
		title: str = "",
		artist: str = "",
		album: str = "",
		debug = False,
		skip_clean_title = False,
		cache_lifetime_seconds = 3600
	):
		"""
		:param title: song title (required)
		:param skip_clean_title: use the title as-is instead of running clean_title on it
		:param cache_lifetime_seconds: how long cached MusicBrainz responses stay valid
		:raises ValueError: when title is empty
		"""
		if title == "":
			# ValueError is a subclass of Exception, so existing broad handlers still catch it
			raise ValueError("title is required")
		self.title = title if skip_clean_title else clean_title(title)
		self.artist = artist
		self.album = album
		self.base = "https://musicbrainz.org/ws/2"
		self.default_params = { "fmt": "json" }
		self.req = CachedSession("shira", expire_after=cache_lifetime_seconds, use_cache_dir=True)
		self.head = { "User-Agent": f"shiradl+mbtag/{shiraver} ( https://github.com/KraXen72/shira )" }

		self.song_dict = None  # MBRecording
		self.artist_dict = None  # MBArtistCredit
		self.album_dict = None  # MBRelease

		self.mb_releasetrackid = None  # song mbid
		self.mb_releasegroupid = None  # album mbid
		self.mb_artistid = None  # artist mbid
		self.debug = debug

	def fetch_song(self):
		"""
		ping mb api to get song (/recording);
		save_song_dict subsequently calls fetch_artist if nothing is found
		:raises Exception: on a non-2xx response
		"""
		params = {
			"query": f'{self.title} AND artist:"{self.artist}" AND release:"{self.album}"',
			**self.default_params
		}
		res = self.req.get(f"{self.base}/recording", params=params, headers=self.head)
		if self.debug:
			print(res.url, res.status_code)
			print("fetch_song query:", params["query"])
		if res.status_code and 200 <= res.status_code < 300:
			resjson = json.loads(res.text)
			self.save_song_dict(resjson["recordings"])
		else:
			raise Exception(f"fetch_song: status code {res.status_code}")
subsequently calls fetch_arist if nothing is found 177 | """ 178 | params = { 179 | "query": f'{self.title} AND artist:"{self.artist}" AND release:"{self.album}"', 180 | **self.default_params 181 | } 182 | res = self.req.get(f"{self.base}/recording", params=params, headers=self.head) 183 | if self.debug: 184 | print(res.url, res.status_code) 185 | print("fetch_song query:", params["query"]) 186 | if res.status_code and res.status_code >= 200 and res.status_code < 300: 187 | resjson = json.loads(res.text) 188 | self.save_song_dict(resjson["recordings"]) 189 | else: 190 | raise Exception(f"fetch_song: status code {res.status_code}") 191 | 192 | def fetch_artist(self): 193 | """ping mb api to get artist (/artist)""" 194 | params = { 195 | "query": self.artist, 196 | **self.default_params 197 | } 198 | res = self.req.get(f"{self.base}/artist", params=params, headers=self.head) 199 | if self.debug: 200 | print(res.url) 201 | print("fetch_artist query:", params["query"]) 202 | if res.status_code and res.status_code >= 200 and res.status_code < 300: 203 | resjson = json.loads(res.text) 204 | self.save_artist_dict(resjson["artists"]) 205 | 206 | def _debug_print_track(self, track: MBRecording, titm: bool, artm: bool, albm: bool): 207 | if not self.debug: 208 | return 209 | print(f"matches: title:{titm}, artist:{artm}, album:{albm}") 210 | print( 211 | track["artist-credit"][0]["artist"]["name"], 212 | "-", 213 | track["title"], 214 | [r["title"] for r in track["releases"]] 215 | ) 216 | print() 217 | 218 | def save_song_dict(self, tracks: list[MBRecording]): 219 | """find the most similar song""" 220 | 221 | if self.debug: 222 | f = open("info.json", "w", encoding="utf8") 223 | json.dump(tracks, f, indent=4, ensure_ascii=False) 224 | f.close() 225 | print("looking for:") 226 | print("title:", self.title) 227 | print("artist:", self.artist) 228 | print("album:", self.album) 229 | 230 | for t in tracks: 231 | if ("artist-credit" not in t) or (len(t["artist-credit"]) == 0) or 
def save_artist_dict(self, artists: "list[MBArtist]"):
	"""remember the first artist candidate that fuzzy-matches self.artist, along with its mbid"""
	hit = next((a for a in artists if check_bareartist_match(self.artist, a)), None)
	if hit is not None:
		self.artist_dict = hit
		self.mb_artistid = hit["id"]

def get_date_str(self):
	"""
	pick the song's release date and normalize it to a full date string
	(YYYY-MM-DD or YYYYMMDD); returns None when no song was fetched,
	no date is available, or the format is unrecognized
	"""
	if self.song_dict is None:
		return None

	# prefer the recording-level first-release-date, otherwise take the
	# first release that carries a date
	raw = self.song_dict.get("first-release-date")
	if raw is None:
		for release in self.song_dict["releases"]:
			if "date" in release:
				raw = release["date"]
				break

	if raw is None:
		return None

	# already a full date (compact or dashed)
	if re.match(r"^\d{8}$", raw) or re.match(r"^\d{4}-\d{2}-\d{2}$", raw):
		return raw
	# pad partial dates down to the first day/month
	if re.match(r"^\d{4}-\d{2}$", raw):
		return raw + "-01"
	if re.match(r"^\d{6}$", raw):
		return raw + "01"
	if re.match(r"^\d{4}$", raw):
		return raw + "-01-01"

	print(f"unknown date format {raw}, skipping date metadata")
	return None
def get_mbid_tags(self):
	"""get mbid tags with proper keys; values are None when the lookup found nothing"""
	# !! make sure only supported fields are multi-value tags, otherwise auxio might crash (don't do multi-value album artists)
	first_mb_artistid = self.mb_artistid[0] if isinstance(self.mb_artistid, list) else self.mb_artistid

	return {
		"mb_releasetrackid": self.mb_releasetrackid,
		"mb_releasegroupid": self.mb_releasegroupid,
		"mb_artistid": self.mb_artistid,
		"mb_albumartistid": first_mb_artistid
	}


def get_mb_tags(self):
	"""
	quickly get {title, artist, album} if it was fetched from MB,
	otherwise all 3 will be None.
	Does no fetching itself.
	"""
	artist = None
	if self.artist_dict is not None:
		# artist_dict is a list[MBArtistCredit] when saved from a recording lookup,
		# or a bare MBArtist dict when saved from the /artist fallback lookup
		artist = self.artist_dict[0]["artist"]["name"] if isinstance(self.artist_dict, list) else self.artist_dict["name"]
	return {
		"title": self.song_dict.get("title") if self.song_dict is not None else None,
		"artist": artist,
		"album": self.album_dict.get("title") if self.album_dict is not None else None,
	}

def musicbrainz_enrich_tags(tags: "Tags", skip_encode = False, exclude_tags: "list[str] | None" = None, use_mbid_data = True):
	"""
	takes in a tags dict, adds mbid tags and (by default) also other mb info, returns it
	:param skip_encode: leave mbid tag values as str instead of utf-8 bytes
	:param exclude_tags: tag keys to skip; "mb*" skips all mbid tags (default: none)
	:param use_mbid_data: also overwrite title/artist/album/date with MusicBrainz data
	"""
	# None sentinel instead of a shared mutable default argument (previously `= []  # noqa: B006`)
	exclude_tags = [] if exclude_tags is None else exclude_tags

	mb = MBSong(title=tags["title"], artist=str(tags["artist"]), album=tags["album"])
	mb.fetch_song()

	if use_mbid_data:
		if mb.artist_dict:
			if isinstance(mb.artist_dict, list):  # TODO fix multi-value tags
				tags["artist"] = [a["artist"]["name"] for a in mb.artist_dict]
			else:  # TODO consider using mb.album_dict to get album artist?
				tags["artist"] = mb.artist_dict["name"]
			tags["albumartist"] = mb.artist_dict[0]["artist"]["name"] if isinstance(mb.artist_dict, list) else mb.artist_dict["name"]
		if mb.album_dict:
			tags["album"] = mb.album_dict["title"]
		if mb.song_dict:
			tags["title"] = mb.song_dict["title"]
		_release_date = mb.get_date_str()
		if _release_date:
			tags["date"] = _release_date
			tags["year"] = parse_datestring(_release_date)["year"]

	if "mb*" in exclude_tags:
		return tags

	for key, tag in mb.get_mbid_tags().items():
		if tag is not None and key not in exclude_tags:
			if skip_encode is False:
				tags[key] = [t.encode("utf-8") for t in tag] if isinstance(tag, list) else tag.encode("utf-8")
			else:
				tags[key] = tag
	return tags
333 | tags["artist"] = mb.artist_dict["name"] 334 | tags["albumartist"] = mb.artist_dict[0]["artist"]["name"] if isinstance(mb.artist_dict, list) else mb.artist_dict["name"] 335 | if mb.album_dict: 336 | tags["album"] = mb.album_dict["title"] 337 | if mb.song_dict: 338 | tags["title"] = mb.song_dict["title"] 339 | _release_date = mb.get_date_str() 340 | # print("mb", _release_date) 341 | if _release_date: 342 | tags["date"] = _release_date 343 | tags["year"] = parse_datestring(_release_date)["year"] 344 | 345 | if "mb*" in exclude_tags: 346 | return tags 347 | 348 | for key, tag in mb.get_mbid_tags().items(): 349 | if tag is not None and key not in exclude_tags: 350 | if skip_encode is False: 351 | tags[key] = [ t.encode("utf-8") for t in tag ] if isinstance(tag, list) else tag.encode("utf-8") 352 | else: 353 | tags[key] = tag 354 | return tags 355 | 356 | -------------------------------------------------------------------------------- /logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 
| 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | --------------------------------------------------------------------------------