├── .binder ├── runtime.txt ├── apt.txt ├── start └── postBuild ├── data └── .gitignore ├── figures └── .gitignore ├── .gitignore ├── setup.py ├── LICENSE.txt ├── makefile ├── requirements.txt ├── webapi.ipynb ├── features.py ├── creation.py ├── usage.ipynb ├── utils.py ├── README.md ├── baselines.ipynb ├── analysis.ipynb └── creation.ipynb /.binder/runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.6 2 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /.binder/apt.txt: -------------------------------------------------------------------------------- 1 | graphviz 2 | ffmpeg 3 | -------------------------------------------------------------------------------- /figures/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /.binder/start: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export AUDIO_DIR=./data/fma_small/ 3 | exec "$@" 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # IPython checkpoints 6 | .ipynb_checkpoints/ 7 | 8 | # Environment 9 | .env 10 | .python-version 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='freemusicarchive', 4 | version='0.0.0', 5 | description='Free Music Archive', 6 | url='https://github.com/mdeff/fma', 7 | author='Michaël Defferrard', 8 | author_email='michael.defferrard@epfl.ch', 9 | license='MIT') 10 | -------------------------------------------------------------------------------- /.binder/postBuild: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | python3.6 -m venv ./env 4 | 5 | ./env/bin/pip install --upgrade pip setuptools wheel 6 | ./env/bin/pip install numpy==1.12.1 # workaround resampy's bogus setup.py 7 | ./env/bin/pip install -r requirements.txt 8 | 9 | # Shadow the default kernelspec for jupyter to use our environment by default. 
10 | ./env/bin/python -m ipykernel install --user 11 | 12 | cd data 13 | curl -O https://os.unil.cloud.switch.ch/fma/fma_metadata.zip 14 | unzip fma_metadata.zip 15 | rm fma_metadata.zip 16 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Michaël Defferrard 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | NB = $(sort $(wildcard *.ipynb)) 2 | 3 | run: $(NB) 4 | 5 | $(NB): 6 | jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@ 7 | 8 | clean: 9 | rm -rf __pycache__/ .ipynb_checkpoints/ 10 | #jupyter nbconvert --inplace --ClearOutputPreprocessor.enabled=True $(NB) 11 | @for nb in $(NB); do \ 12 | echo "$$(jq --indent 1 ' \ 13 | .metadata = {} \ 14 | | (.cells[] | select(has("outputs")) | .outputs) = [] \ 15 | | (.cells[] | select(has("execution_count")) | .execution_count) = null \ 16 | | .cells[].metadata = {} \ 17 | ' $$nb)" > $$nb; \ 18 | done 19 | 20 | # May be useful to keep for nbsphinx. 21 | # | .metadata = {"language_info": {"name": "python", "pygments_lexer": "ipython3"}} \ 22 | 23 | install: 24 | pip install --upgrade pip setuptools wheel 25 | pip install numpy==1.12.1 # bug: resampy imports numpy in setup.py 26 | # pip install setuptools==38.2.4 # MarkupSafe 1.0 setup.py needs `from setuptools import Feature` 27 | pip install -r requirements.txt 28 | 29 | readme: 30 | grip README.md 31 | 32 | html: 33 | grip --export README.md 34 | jupyter nbconvert $(NB) --to html 35 | 36 | .PHONY: run $(NB) clean install readme html 37 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Version numbers have been retrieved from a range of machines and environments. 2 | # Take them with a grain of salt. 
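# Note: numpy must be installed before the rest (as done in the makefile and .binder/postBuild) because resampy's setup.py imports numpy at build time.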
3 | 4 | # Direct dependencies 5 | #python==3.6.0 6 | #pip==9.0.1 7 | #setuptools==38.2.4 # old for MarkupSafe 1.0 (28.8.0 is installed with py 3.6) 8 | numpy==1.12.1 # 1.12.0 9 | pandas==0.19.2 10 | matplotlib==2.0.0 11 | seaborn==0.7.1 12 | scikit-learn==0.18.1 13 | tensorflow-gpu==1.0.1 # 1.0.0 14 | Keras==1.2.2 # 2.0.2 / 2.0.3 15 | librosa==0.5.0 16 | audioread==2.1.4 17 | mutagen==1.39 # 1.39.dev0 18 | pydub==0.18.0 19 | #exiftool # Only considered at some point. 20 | #eyed3 # Only considered at some point. 21 | requests==2.13.0 22 | pydot==1.2.3 23 | tqdm==4.11.2 24 | python-dotenv==0.6.3 # 0.6.4 25 | 26 | # Dependencies of the above. 27 | certifi==2017.11.5 28 | click==6.7 29 | cycler==0.10.0 30 | Cython==0.25.2 31 | decorator==4.0.11 # 4.1.2 32 | joblib==0.11 33 | protobuf==3.2.0 34 | pyparsing==2.2.0 35 | python-dateutil==2.6.0 36 | pytz==2017.2 # 2016.10 / 2017.3 37 | PyYAML==3.12 38 | resampy==0.1.5 39 | scipy==0.19.0 # 0.18.1 40 | six==1.10.0 41 | Theano==0.9.0 # 0.8.2 42 | 43 | # Jupyter notebook and its dependencies. 44 | notebook==5.0.0 # 4.4.1 45 | ipywidgets==6.0.0 46 | bleach==2.0.0 # 1.5.0 47 | entrypoints==0.2.2 48 | html5lib==0.999999999 # 0.9999999 49 | ipykernel==4.6.0 # 4.5.2 / 4.6.1 50 | ipython==5.3.0 51 | ipython-genutils==0.2.0 # 0.1.0 52 | Jinja2==2.9.6 # 2.9.5 / 2.10 53 | jsonschema==2.6.0 54 | jupyter-client==5.0.0 # 5.0.1 55 | jupyter-core==4.3.0 56 | MarkupSafe==0.23 # 1.0 (requires an old setuptools for `from setuptools import Feature`) 57 | mistune==0.7.4 # 0.7.3 58 | nbconvert==5.1.1 59 | nbformat==4.3.0 60 | packaging==16.8 61 | pandocfilters==1.4.1 62 | pexpect==4.2.1 63 | pickleshare==0.7.4 64 | prompt-toolkit==1.0.14 # 1.0.13 65 | ptyprocess==0.5.1 66 | Pygments==2.2.0 67 | pyparsing==2.2.0 68 | pyzmq==16.0.2 69 | simplegeneric==0.8.1 70 | terminado==0.6 71 | testpath==0.3 72 | tornado==4.4.3 # 4.4.2 73 | traitlets==4.3.2 74 | wcwidth==0.1.7 75 | webencodings==0.5.1 76 | widgetsnbextension==2.0.0 77 | -------------------------------------------------------------------------------- /webapi.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n", 8 | "\n", 9 | "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n", 10 | "\n", 11 | "## Free Music Archive web API\n", 12 | "\n", 13 | "All the data in the `raw_*.csv` tables was collected from the Free Music Archive [public API](https://freemusicarchive.org/api). With this notebook, you can:\n", 14 | "* reconstruct the original data, \n", 15 | "* update some fields, e.g. the `track listens` (play count),\n", 16 | "* augment the data with newer fields which may have been introduced in their API,\n", 17 | "* update the dataset with new songs added to the archive.\n", 18 | "\n", 19 | "Notes:\n", 20 | "* You need a key to access the API, which you can [request online](https://freemusicarchive.org/api/agreement) and write into your `.env` file as a new line reading `FMA_KEY=MYPERSONALKEY`.\n", 21 | "* Requests take a few hundred milliseconds to complete."
22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import os\n", 31 | "import IPython.display as ipd\n", 32 | "import utils" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "fma = utils.FreeMusicArchive(os.environ.get('FMA_KEY'))" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## 1 Get recently added tracks\n", 49 | "\n", 50 | "* `track_id` are assigned in monotonically increasing order.\n", 51 | "* Tracks can be removed, so that number does not indicate the number of available tracks." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "for track_id, artist_name, date_created in zip(*fma.get_recent_tracks()):\n", 61 | " print(track_id, date_created, artist_name)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## 2 Get metadata about tracks, albums and artists\n", 69 | "\n", 70 | "Given IDs, we can get information about tracks, albums and artists. See the available fields in the [API documentation](https://freemusicarchive.org/api)." 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "fma.get_track(track_id=2, fields=['track_title', 'track_date_created',\n", 80 | " 'track_duration', 'track_bit_rate',\n", 81 | " 'track_listens', 'track_interest', 'track_comments', 'track_favorites',\n", 82 | " 'artist_id', 'album_id'])" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "fma.get_track_genres(track_id=20)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "fma.get_album(album_id=1, fields=['album_title', 'album_tracks',\n", 101 | " 'album_listens', 'album_comments', 'album_favorites',\n", 102 | " 'album_date_created', 'album_date_released'])" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "fma.get_artist(artist_id=1, fields=['artist_name', 'artist_location',\n", 112 | " 'artist_comments', 'artist_favorites'])" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "## 3 Get data, i.e. raw audio\n", 120 | "\n", 121 | "We can download the original audio as well. Tracks are provided by the archive as MP3 with various bit and sample rates." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "track_file = fma.get_track(2, 'track_file')\n", 131 | "fma.download_track(track_file, path='track.mp3')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "## 4 Get genres\n", 139 | "\n", 140 | "Instead of compiling the genres of each track, we can get all the genres present on the archive with some API calls." 
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "genres = fma.get_all_genres()\n", 150 | "print('{} genres'.format(genres.shape[0]))\n", 151 | "genres[10:25]" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "And look for genres related to Rock." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "genres[['Rock' in title for title in genres['genre_title']]]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "genres[genres['genre_parent_id'] == '12']" 177 | ] 178 | } 179 | ], 180 | "metadata": {}, 181 | "nbformat": 4, 182 | "nbformat_minor": 2 183 | } 184 | -------------------------------------------------------------------------------- /features.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # FMA: A Dataset For Music Analysis 4 | # Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2. 5 | 6 | # All features are extracted using [librosa](https://github.com/librosa/librosa). 7 | # Alternatives: 8 | # * [Essentia](http://essentia.upf.edu) (C++ with Python bindings) 9 | # * [MARSYAS](https://github.com/marsyas/marsyas) (C++ with Python bindings) 10 | # * [RP extract](http://www.ifs.tuwien.ac.at/mir/downloads.html) (Matlab, Java, Python) 11 | # * [jMIR jAudio](http://jmir.sourceforge.net) (Java) 12 | # * [MIRtoolbox](https://www.jyu.fi/hum/laitokset/musiikki/en/research/coe/materials/mirtoolbox) (Matlab) 13 | 14 | import os 15 | import multiprocessing 16 | import warnings 17 | import numpy as np 18 | from scipy import stats 19 | import pandas as pd 20 | import librosa 21 | from tqdm import tqdm 22 | import utils 23 | 24 | 25 | def columns(): 26 | feature_sizes = dict(chroma_stft=12, chroma_cqt=12, chroma_cens=12, 27 | tonnetz=6, mfcc=20, rmse=1, zcr=1, 28 | spectral_centroid=1, spectral_bandwidth=1, 29 | spectral_contrast=7, spectral_rolloff=1) 30 | moments = ('mean', 'std', 'skew', 'kurtosis', 'median', 'min', 'max') 31 | 32 | columns = [] 33 | for name, size in feature_sizes.items(): 34 | for moment in moments: 35 | it = ((name, moment, '{:02d}'.format(i+1)) for i in range(size)) 36 | columns.extend(it) 37 | 38 | names = ('feature', 'statistics', 'number') 39 | columns = pd.MultiIndex.from_tuples(columns, names=names) 40 | 41 | # More efficient to slice if indexes are sorted. 42 | return columns.sort_values() 43 | 44 | 45 | def compute_features(tid): 46 | 47 | features = pd.Series(index=columns(), dtype=np.float32, name=tid) 48 | 49 | # Catch warnings as exceptions (audioread leaks file descriptors). 
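# Raising librosa warnings as errors makes failures explicit: the try/except below
# prints the offending track id and leaves that track's features as NaN.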
50 | warnings.filterwarnings('error', module='librosa') 51 | 52 | def feature_stats(name, values): 53 | features[name, 'mean'] = np.mean(values, axis=1) 54 | features[name, 'std'] = np.std(values, axis=1) 55 | features[name, 'skew'] = stats.skew(values, axis=1) 56 | features[name, 'kurtosis'] = stats.kurtosis(values, axis=1) 57 | features[name, 'median'] = np.median(values, axis=1) 58 | features[name, 'min'] = np.min(values, axis=1) 59 | features[name, 'max'] = np.max(values, axis=1) 60 | 61 | try: 62 | filepath = utils.get_audio_path(os.environ.get('AUDIO_DIR'), tid) 63 | x, sr = librosa.load(filepath, sr=None, mono=True) # kaiser_fast 64 | 65 | f = librosa.feature.zero_crossing_rate(x, frame_length=2048, hop_length=512) 66 | feature_stats('zcr', f) 67 | 68 | cqt = np.abs(librosa.cqt(x, sr=sr, hop_length=512, bins_per_octave=12, 69 | n_bins=7*12, tuning=None)) 70 | assert cqt.shape[0] == 7 * 12 71 | assert np.ceil(len(x)/512) <= cqt.shape[1] <= np.ceil(len(x)/512)+1 72 | 73 | f = librosa.feature.chroma_cqt(C=cqt, n_chroma=12, n_octaves=7) 74 | feature_stats('chroma_cqt', f) 75 | f = librosa.feature.chroma_cens(C=cqt, n_chroma=12, n_octaves=7) 76 | feature_stats('chroma_cens', f) 77 | f = librosa.feature.tonnetz(chroma=f) 78 | feature_stats('tonnetz', f) 79 | 80 | del cqt 81 | stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512)) 82 | assert stft.shape[0] == 1 + 2048 // 2 83 | assert np.ceil(len(x)/512) <= stft.shape[1] <= np.ceil(len(x)/512)+1 84 | del x 85 | 86 | f = librosa.feature.chroma_stft(S=stft**2, n_chroma=12) 87 | feature_stats('chroma_stft', f) 88 | 89 | f = librosa.feature.rmse(S=stft) 90 | feature_stats('rmse', f) 91 | 92 | f = librosa.feature.spectral_centroid(S=stft) 93 | feature_stats('spectral_centroid', f) 94 | f = librosa.feature.spectral_bandwidth(S=stft) 95 | feature_stats('spectral_bandwidth', f) 96 | f = librosa.feature.spectral_contrast(S=stft, n_bands=6) 97 | feature_stats('spectral_contrast', f) 98 | f = librosa.feature.spectral_rolloff(S=stft) 99 | feature_stats('spectral_rolloff', f) 100 | 101 | mel = librosa.feature.melspectrogram(sr=sr, S=stft**2) 102 | del stft 103 | f = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20) 104 | feature_stats('mfcc', f) 105 | 106 | except Exception as e: 107 | print('{}: {}'.format(tid, repr(e))) 108 | 109 | return features 110 | 111 | 112 | def main(): 113 | tracks = utils.load('tracks.csv') 114 | features = pd.DataFrame(index=tracks.index, 115 | columns=columns(), dtype=np.float32) 116 | 117 | # More than usable CPUs to be CPU bound, not I/O bound. Beware memory. 118 | nb_workers = int(1.5 * len(os.sched_getaffinity(0))) 119 | 120 | # Longest is ~11,000 seconds. Limit processes to avoid memory errors. 121 | table = ((5000, 1), (3000, 3), (2000, 5), (1000, 10), (0, nb_workers)) 122 | for duration, nb_workers in table: 123 | print('Working with {} processes.'.format(nb_workers)) 124 | 125 | tids = tracks[tracks['track', 'duration'] >= duration].index 126 | tracks.drop(tids, axis=0, inplace=True) 127 | 128 | pool = multiprocessing.Pool(nb_workers) 129 | it = pool.imap_unordered(compute_features, tids) 130 | 131 | for i, row in enumerate(tqdm(it, total=len(tids))): 132 | features.loc[row.name] = row 133 | 134 | if i % 1000 == 0: 135 | save(features, 10) 136 | 137 | save(features, 10) 138 | test(features, 10) 139 | 140 | 141 | def save(features, ndigits): 142 | 143 | # Should be done already, just to be sure. 
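# Sorted row and column indexes keep the CSV layout deterministic and allow
# efficient label-based slicing (see the comment in columns()).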
144 | features.sort_index(axis=0, inplace=True) 145 | features.sort_index(axis=1, inplace=True) 146 | 147 | features.to_csv('features.csv', float_format='%.{}e'.format(ndigits)) 148 | 149 | 150 | def test(features, ndigits): 151 | 152 | indices = features[features.isnull().any(axis=1)].index 153 | if len(indices) > 0: 154 | print('Failed tracks: {}'.format(', '.join(str(i) for i in indices))) 155 | 156 | tmp = utils.load('features.csv') 157 | np.testing.assert_allclose(tmp.values, features.values, rtol=10**-ndigits) 158 | 159 | 160 | if __name__ == "__main__": 161 | main() 162 | -------------------------------------------------------------------------------- /creation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # FMA: A Dataset For Music Analysis 4 | # Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2. 5 | 6 | import os 7 | import sys 8 | import shutil 9 | import pickle 10 | import zipfile 11 | import subprocess as sp 12 | from datetime import datetime 13 | from tqdm import tqdm, trange 14 | import pandas as pd 15 | import utils 16 | 17 | 18 | TIME = datetime(2017, 4, 1).timestamp() 19 | 20 | README = """This .zip archive is part of the FMA, a dataset for music analysis. 21 | Code & data: https://github.com/mdeff/fma 22 | Paper: https://arxiv.org/abs/1612.01840 23 | 24 | Each .mp3 is licensed by its artist. 25 | 26 | The content's integrity can be verified with sha1sum -c checksums. 27 | """ 28 | 29 | 30 | def download_metadata(): 31 | 32 | fma = utils.FreeMusicArchive(os.environ.get('FMA_KEY')) 33 | 34 | max_tid = int(fma.get_recent_tracks()[0][0]) 35 | print('Largest track id: {}'.format(max_tid)) 36 | 37 | not_found = {} 38 | 39 | id_range = trange(max_tid, desc='tracks') 40 | tracks, not_found['tracks'] = fma.get_all('track', id_range) 41 | 42 | id_range = tqdm(tracks['album_id'].unique(), desc='albums') 43 | albums, not_found['albums'] = fma.get_all('album', id_range) 44 | 45 | id_range = tqdm(tracks['artist_id'].unique(), desc='artists') 46 | artists, not_found['artists'] = fma.get_all('artist', id_range) 47 | 48 | genres = fma.get_all_genres() 49 | 50 | for dataset in 'tracks', 'albums', 'artists', 'genres': 51 | eval(dataset).sort_index(axis=0, inplace=True) 52 | eval(dataset).sort_index(axis=1, inplace=True) 53 | eval(dataset).to_csv('raw_' + dataset + '.csv') 54 | 55 | pickle.dump(not_found, open('not_found.pickle', 'wb')) 56 | 57 | 58 | def _create_subdirs(dst_dir, tracks): 59 | 60 | # Get write access. 61 | if not os.path.exists(dst_dir): 62 | os.makedirs(dst_dir) 63 | os.chmod(dst_dir, 0o777) 64 | 65 | # Create writable sub-directories. 66 | n_folders = max(tracks.index) // 1000 + 1 67 | for folder in range(n_folders): 68 | dst = os.path.join(dst_dir, '{:03d}'.format(folder)) 69 | if not os.path.exists(dst): 70 | os.makedirs(dst) 71 | os.chmod(dst, 0o777) 72 | 73 | 74 | def download_data(dst_dir): 75 | 76 | dst_dir = os.path.abspath(dst_dir) 77 | tracks = pd.read_csv('raw_tracks.csv', index_col=0) 78 | _create_subdirs(dst_dir, tracks) 79 | 80 | fma = utils.FreeMusicArchive(os.environ.get('FMA_KEY')) 81 | not_found = pickle.load(open('not_found.pickle', 'rb')) 82 | not_found['audio'] = [] 83 | 84 | # Download missing tracks. 
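# Tracks already present on disk are skipped, so an interrupted download can simply
# be restarted; ids whose download fails are recorded in not_found['audio'].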
85 | for tid in tqdm(tracks.index): 86 | dst = utils.get_audio_path(dst_dir, tid) 87 | if not os.path.exists(dst): 88 | try: 89 | fma.download_track(tracks.at[tid, 'track_file'], dst) 90 | except: # requests.HTTPError 91 | not_found['audio'].append(tid) 92 | 93 | pickle.dump(not_found, open('not_found.pickle', 'wb')) 94 | 95 | 96 | def convert_duration(x): 97 | times = x.split(':') 98 | seconds = int(times[-1]) 99 | minutes = int(times[-2]) 100 | try: 101 | minutes += 60 * int(times[-3]) 102 | except IndexError: 103 | pass 104 | return seconds + 60 * minutes 105 | 106 | 107 | def trim_audio(dst_dir): 108 | 109 | dst_dir = os.path.abspath(dst_dir) 110 | fma_full = os.path.join(dst_dir, 'fma_full') 111 | fma_large = os.path.join(dst_dir, 'fma_large') 112 | tracks = pd.read_csv('raw_tracks.csv', index_col=0) 113 | _create_subdirs(fma_large, tracks) 114 | 115 | not_found = pickle.load(open('not_found.pickle', 'rb')) 116 | not_found['clips'] = [] 117 | 118 | for tid in tqdm(tracks.index): 119 | duration = convert_duration(tracks.at[tid, 'track_duration']) 120 | src = utils.get_audio_path(fma_full, tid) 121 | dst = utils.get_audio_path(fma_large, tid) 122 | if tid in not_found['audio']: 123 | continue 124 | elif os.path.exists(dst): 125 | continue 126 | elif duration <= 30: 127 | shutil.copyfile(src, dst) 128 | else: 129 | start = duration // 2 - 15 130 | command = ['ffmpeg', '-i', src, 131 | '-ss', str(start), '-t', '30', 132 | '-acodec', 'copy', dst] 133 | try: 134 | sp.run(command, check=True, stderr=sp.DEVNULL) 135 | except sp.CalledProcessError: 136 | not_found['clips'].append(tid) 137 | 138 | for tid in not_found['clips']: 139 | try: 140 | os.remove(utils.get_audio_path(fma_large, tid)) 141 | except FileNotFoundError: 142 | pass 143 | 144 | pickle.dump(not_found, open('not_found.pickle', 'wb')) 145 | 146 | 147 | def normalize_permissions_times(dst_dir): 148 | dst_dir = os.path.abspath(dst_dir) 149 | for dirpath, dirnames, filenames in tqdm(os.walk(dst_dir)): 150 | for name in filenames: 151 | dst = os.path.join(dirpath, name) 152 | os.chmod(dst, 0o444) 153 | os.utime(dst, (TIME, TIME)) 154 | for name in dirnames: 155 | dst = os.path.join(dirpath, name) 156 | os.chmod(dst, 0o555) 157 | os.utime(dst, (TIME, TIME)) 158 | 159 | 160 | def create_zips(dst_dir): 161 | 162 | def get_filepaths(subset): 163 | filepaths = [] 164 | tids = tracks.index[tracks['set', 'subset'] <= subset] 165 | for tid in tids: 166 | filepaths.append(utils.get_audio_path('', tid)) 167 | return filepaths 168 | 169 | def get_checksums(base_dir, filepaths): 170 | """Checksums are assumed to be stored in order for efficiency.""" 171 | checksums = [] 172 | with open(os.path.join(dst_dir, base_dir, 'checksums')) as f: 173 | for filepath in filepaths: 174 | exist = False 175 | for line in f: 176 | if filepath == line[42:-1]: 177 | exist = True 178 | break 179 | if not exist: 180 | raise ValueError('checksum not found: {}'.format(filepath)) 181 | checksums.append(line) 182 | return checksums 183 | 184 | def create_zip(zip_filename, base_dir, filepaths): 185 | 186 | # Audio: all compressions are the same. 187 | # CSV: stored > deflated > BZIP2 > LZMA. 188 | # LZMA is close to BZIP2 and too recent to be widely available (unzip). 
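# That is, the resulting CSV archive sizes are stored > deflated > BZIP2 > LZMA,
# so BZIP2 is the smallest option that common unzip tools can still extract.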
189 | compression = zipfile.ZIP_BZIP2 190 | 191 | zip_filepath = os.path.join(dst_dir, zip_filename) 192 | with zipfile.ZipFile(zip_filepath, 'x', compression) as zf: 193 | 194 | def info(name): 195 | name = os.path.join(zip_filename[:-4], name) 196 | info = zipfile.ZipInfo(name, (2017, 4, 1, 0, 0, 0)) 197 | info.external_attr = 0o444 << 16 | 0o2 << 30 198 | return info 199 | 200 | zf.writestr(info('README.txt'), README, compression) 201 | 202 | checksums = get_checksums(base_dir, filepaths) 203 | zf.writestr(info('checksums'), ''.join(checksums), compression) 204 | 205 | for filepath in tqdm(filepaths): 206 | src = os.path.join(dst_dir, base_dir, filepath) 207 | dst = os.path.join(zip_filename[:-4], filepath) 208 | zf.write(src, dst) 209 | 210 | os.chmod(zip_filepath, 0o444) 211 | os.utime(zip_filepath, (TIME, TIME)) 212 | 213 | METADATA = [ 214 | 'not_found.pickle', 215 | 'raw_genres.csv', 'raw_albums.csv', 216 | 'raw_artists.csv', 'raw_tracks.csv', 217 | 'tracks.csv', 'genres.csv', 218 | 'raw_echonest.csv', 'echonest.csv', 'features.csv', 219 | ] 220 | create_zip('fma_metadata.zip', 'fma_metadata', METADATA) 221 | 222 | tracks = utils.load('tracks.csv') 223 | create_zip('fma_small.zip', 'fma_large', get_filepaths('small')) 224 | create_zip('fma_medium.zip', 'fma_large', get_filepaths('medium')) 225 | create_zip('fma_large.zip', 'fma_large', get_filepaths('large')) 226 | create_zip('fma_full.zip', 'fma_full', get_filepaths('large')) 227 | 228 | 229 | if __name__ == "__main__": 230 | if sys.argv[1] == 'metadata': 231 | download_metadata() 232 | elif sys.argv[1] == 'data': 233 | download_data(sys.argv[2]) 234 | elif sys.argv[1] == 'clips': 235 | trim_audio(sys.argv[2]) 236 | elif sys.argv[1] == 'normalize': 237 | normalize_permissions_times(sys.argv[2]) 238 | elif sys.argv[1] == 'zips': 239 | create_zips(sys.argv[2]) 240 | -------------------------------------------------------------------------------- /usage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n", 8 | "\n", 9 | "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n", 10 | "\n", 11 | "## Usage\n", 12 | "\n", 13 | "1. Go through the [paper] to understand what the data is about.\n", 14 | "1. Download some datasets from .\n", 15 | "1. Uncompress the archives, e.g. with `unzip fma_small.zip`.\n", 16 | "1. 
Load and play with the data in this notebook.\n", 17 | "\n", 18 | "[paper]: https://arxiv.org/abs/1612.01840" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "%matplotlib inline\n", 28 | "\n", 29 | "import os\n", 30 | "\n", 31 | "import IPython.display as ipd\n", 32 | "import numpy as np\n", 33 | "import pandas as pd\n", 34 | "import matplotlib.pyplot as plt\n", 35 | "import seaborn as sns\n", 36 | "import sklearn as skl\n", 37 | "import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm\n", 38 | "import librosa\n", 39 | "import librosa.display\n", 40 | "\n", 41 | "import utils\n", 42 | "\n", 43 | "plt.rcParams['figure.figsize'] = (17, 5)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# Directory where mp3s are stored.\n", 53 | "AUDIO_DIR = os.environ.get('AUDIO_DIR')\n", 54 | "\n", 55 | "# Load metadata and features.\n", 56 | "tracks = utils.load('data/fma_metadata/tracks.csv')\n", 57 | "genres = utils.load('data/fma_metadata/genres.csv')\n", 58 | "features = utils.load('data/fma_metadata/features.csv')\n", 59 | "echonest = utils.load('data/fma_metadata/echonest.csv')\n", 60 | "\n", 61 | "np.testing.assert_array_equal(features.index, tracks.index)\n", 62 | "assert echonest.index.isin(tracks.index).all()\n", 63 | "\n", 64 | "tracks.shape, genres.shape, features.shape, echonest.shape" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## 1 Metadata\n", 72 | "\n", 73 | "The metadata table, a CSV file in the `fma_metadata.zip` archive, is composed of many columns:\n", 74 | "1. The index is the ID of the song, taken from the website and used as the name of the audio file.\n", 75 | "2. Per-track, per-album and per-artist metadata from the Free Music Archive website.\n", 76 | "3. Two columns to indicate the subset (small, medium, large) and the split (training, validation, test)." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "ipd.display(tracks['track'].head())\n", 86 | "ipd.display(tracks['album'].head())\n", 87 | "ipd.display(tracks['artist'].head())\n", 88 | "ipd.display(tracks['set'].head())" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "### 1.1 Subsets\n", 96 | "\n", 97 | "The small and medium subsets can be selected with the code below." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "small = tracks[tracks['set', 'subset'] <= 'small']\n", 107 | "small.shape" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "medium = tracks[tracks['set', 'subset'] <= 'medium']\n", 117 | "medium.shape" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## 2 Genres\n", 125 | "\n", 126 | "The genre hierarchy is stored in `genres.csv` and distributed in `fma_metadata.zip`."
127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "print('{} top-level genres'.format(len(genres['top_level'].unique())))\n", 136 | "genres.loc[genres['top_level'].unique()].sort_values('#tracks', ascending=False)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "genres.sort_values('#tracks').head(10)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "## 3 Features\n", 153 | "\n", 154 | "1. Features extracted from the audio for all tracks.\n", 155 | "2. For some tracks, data collected from the [Echonest](http://the.echonest.com/) API." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "print('{1} features for {0} tracks'.format(*features.shape))\n", 165 | "columns = ['mfcc', 'chroma_cens', 'tonnetz', 'spectral_contrast']\n", 166 | "columns.append(['spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff'])\n", 167 | "columns.append(['rmse', 'zcr'])\n", 168 | "for column in columns:\n", 169 | "    ipd.display(features[column].head().style.format('{:.2f}'))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "### 3.1 Echonest features" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "print('{1} features for {0} tracks'.format(*echonest.shape))\n", 186 | "ipd.display(echonest['echonest', 'metadata'].head())\n", 187 | "ipd.display(echonest['echonest', 'audio_features'].head())\n", 188 | "ipd.display(echonest['echonest', 'social_features'].head())\n", 189 | "ipd.display(echonest['echonest', 'ranks'].head())" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "ipd.display(echonest['echonest', 'temporal_features'].head())\n", 199 | "x = echonest.loc[2, ('echonest', 'temporal_features')]\n", 200 | "plt.plot(x);" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### 3.2 Features like MFCCs are discriminative" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "small = tracks['set', 'subset'] <= 'small'\n", 217 | "genre1 = tracks['track', 'genre_top'] == 'Instrumental'\n", 218 | "genre2 = tracks['track', 'genre_top'] == 'Hip-Hop'\n", 219 | "\n", 220 | "X = features.loc[small & (genre1 | genre2), 'mfcc']\n", 221 | "X = skl.decomposition.PCA(n_components=2).fit_transform(X)\n", 222 | "\n", 223 | "y = tracks.loc[small & (genre1 | genre2), ('track', 'genre_top')]\n", 224 | "y = skl.preprocessing.LabelEncoder().fit_transform(y)\n", 225 | "\n", 226 | "plt.scatter(X[:,0], X[:,1], c=y, cmap='RdBu', alpha=0.5)\n", 227 | "X.shape, y.shape" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "## 4 Audio\n", 235 | "\n", 236 | "You can load the waveform and listen to audio in the notebook itself."
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "filename = utils.get_audio_path(AUDIO_DIR, 2)\n", 246 | "print('File: {}'.format(filename))\n", 247 | "\n", 248 | "x, sr = librosa.load(filename, sr=None, mono=True)\n", 249 | "print('Duration: {:.2f}s, {} samples'.format(x.shape[-1] / sr, x.size))\n", 250 | "\n", 251 | "start, end = 7, 17\n", 252 | "ipd.Audio(data=x[start*sr:end*sr], rate=sr)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "And use [librosa](https://github.com/librosa/librosa) to compute spectrograms and audio features." 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "librosa.display.waveplot(x, sr, alpha=0.5);\n", 269 | "plt.vlines([start, end], -1, 1)\n", 270 | "\n", 271 | "start = len(x) // 2\n", 272 | "plt.figure()\n", 273 | "plt.plot(x[start:start+2000])\n", 274 | "plt.ylim((-1, 1));" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))\n", 284 | "mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)\n", 285 | "log_mel = librosa.logamplitude(mel)\n", 286 | "\n", 287 | "librosa.display.specshow(log_mel, sr=sr, hop_length=512, x_axis='time', y_axis='mel');" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)\n", 297 | "mfcc = skl.preprocessing.StandardScaler().fit_transform(mfcc)\n", 298 | "librosa.display.specshow(mfcc, sr=sr, x_axis='time');" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "## 5 Genre classification" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### 5.1 From features" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "small = tracks['set', 'subset'] <= 'small'\n", 322 | "\n", 323 | "train = tracks['set', 'split'] == 'training'\n", 324 | "val = tracks['set', 'split'] == 'validation'\n", 325 | "test = tracks['set', 'split'] == 'test'\n", 326 | "\n", 327 | "y_train = tracks.loc[small & train, ('track', 'genre_top')]\n", 328 | "y_test = tracks.loc[small & test, ('track', 'genre_top')]\n", 329 | "X_train = features.loc[small & train, 'mfcc']\n", 330 | "X_test = features.loc[small & test, 'mfcc']\n", 331 | "\n", 332 | "print('{} training examples, {} testing examples'.format(y_train.size, y_test.size))\n", 333 | "print('{} features, {} classes'.format(X_train.shape[1], np.unique(y_train).size))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "# Be sure training samples are shuffled.\n", 343 | "X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)\n", 344 | "\n", 345 | "# Standardize features by removing the mean and scaling to unit variance.\n", 346 | "scaler = skl.preprocessing.StandardScaler(copy=False)\n", 347 | "scaler.fit_transform(X_train)\n", 348 | "scaler.transform(X_test)\n", 349 | "\n", 350 | "# Support vector classification.\n", 351 | "clf = 
skl.svm.SVC()\n", 352 | "clf.fit(X_train, y_train)\n", 353 | "score = clf.score(X_test, y_test)\n", 354 | "print('Accuracy: {:.2%}'.format(score))" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "### 5.2 From audio" 362 | ] 363 | } 364 | ], 365 | "metadata": {}, 366 | "nbformat": 4, 367 | "nbformat_minor": 2 368 | } 369 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import dotenv 2 | import pydot 3 | import requests 4 | import numpy as np 5 | import pandas as pd 6 | import ctypes 7 | import shutil 8 | import multiprocessing 9 | import multiprocessing.sharedctypes as sharedctypes 10 | import os.path 11 | import ast 12 | 13 | 14 | # Number of samples per 30s audio clip. 15 | # TODO: fix dataset to be constant. 16 | NB_AUDIO_SAMPLES = 1321967 17 | SAMPLING_RATE = 44100 18 | 19 | # Load the environment from the .env file. 20 | dotenv.load_dotenv(dotenv.find_dotenv()) 21 | 22 | 23 | class FreeMusicArchive: 24 | 25 | BASE_URL = 'https://freemusicarchive.org/api/get/' 26 | 27 | def __init__(self, api_key): 28 | self.api_key = api_key 29 | 30 | def get_recent_tracks(self): 31 | URL = 'https://freemusicarchive.org/recent.json' 32 | r = requests.get(URL) 33 | r.raise_for_status() 34 | tracks = [] 35 | artists = [] 36 | date_created = [] 37 | for track in r.json()['aTracks']: 38 | tracks.append(track['track_id']) 39 | artists.append(track['artist_name']) 40 | date_created.append(track['track_date_created']) 41 | return tracks, artists, date_created 42 | 43 | def _get_data(self, dataset, fma_id, fields=None): 44 | url = self.BASE_URL + dataset + 's.json?' 45 | url += dataset + '_id=' + str(fma_id) + '&api_key=' + self.api_key 46 | # print(url) 47 | r = requests.get(url) 48 | r.raise_for_status() 49 | if r.json()['errors']: 50 | raise Exception(r.json()['errors']) 51 | data = r.json()['dataset'][0] 52 | r_id = data[dataset + '_id'] 53 | if r_id != str(fma_id): 54 | raise Exception('The received id {} does not correspond to' 55 | 'the requested one {}'.format(r_id, fma_id)) 56 | if fields is None: 57 | return data 58 | if type(fields) is list: 59 | ret = {} 60 | for field in fields: 61 | ret[field] = data[field] 62 | return ret 63 | else: 64 | return data[fields] 65 | 66 | def get_track(self, track_id, fields=None): 67 | return self._get_data('track', track_id, fields) 68 | 69 | def get_album(self, album_id, fields=None): 70 | return self._get_data('album', album_id, fields) 71 | 72 | def get_artist(self, artist_id, fields=None): 73 | return self._get_data('artist', artist_id, fields) 74 | 75 | def get_all(self, dataset, id_range): 76 | index = dataset + '_id' 77 | 78 | id_ = 2 if dataset == 'track' else 1 79 | row = self._get_data(dataset, id_) 80 | df = pd.DataFrame(columns=row.keys()) 81 | df.set_index(index, inplace=True) 82 | 83 | not_found_ids = [] 84 | 85 | for id_ in id_range: 86 | try: 87 | row = self._get_data(dataset, id_) 88 | except: 89 | not_found_ids.append(id_) 90 | continue 91 | row.pop(index) 92 | df = df.append(pd.Series(row, name=id_)) 93 | 94 | return df, not_found_ids 95 | 96 | def download_track(self, track_file, path): 97 | url = 'https://files.freemusicarchive.org/' + track_file 98 | r = requests.get(url, stream=True) 99 | r.raise_for_status() 100 | with open(path, 'wb') as f: 101 | shutil.copyfileobj(r.raw, f) 102 | 103 | def get_track_genres(self, track_id): 104 | genres = 
self.get_track(track_id, 'track_genres') 105 | genre_ids = [] 106 | genre_titles = [] 107 | for genre in genres: 108 | genre_ids.append(genre['genre_id']) 109 | genre_titles.append(genre['genre_title']) 110 | return genre_ids, genre_titles 111 | 112 | def get_all_genres(self): 113 | df = pd.DataFrame(columns=['genre_parent_id', 'genre_title', 114 | 'genre_handle', 'genre_color']) 115 | df.index.rename('genre_id', inplace=True) 116 | 117 | page = 1 118 | while True: 119 | url = self.BASE_URL + 'genres.json?limit=50' 120 | url += '&page={}&api_key={}'.format(page, self.api_key) 121 | r = requests.get(url) 122 | for genre in r.json()['dataset']: 123 | genre_id = int(genre.pop(df.index.name)) 124 | df.loc[genre_id] = genre 125 | assert (r.json()['page'] == str(page)) 126 | page += 1 127 | if page > r.json()['total_pages']: 128 | break 129 | 130 | return df 131 | 132 | 133 | class Genres: 134 | 135 | def __init__(self, genres_df): 136 | self.df = genres_df 137 | 138 | def create_tree(self, roots, depth=None): 139 | 140 | if type(roots) is not list: 141 | roots = [roots] 142 | graph = pydot.Dot(graph_type='digraph', strict=True) 143 | 144 | def create_node(genre_id): 145 | title = self.df.at[genre_id, 'title'] 146 | ntracks = self.df.at[genre_id, '#tracks'] 147 | # name = self.df.at[genre_id, 'title'] + '\n' + str(genre_id) 148 | name = '"{}\n{} / {}"'.format(title, genre_id, ntracks) 149 | return pydot.Node(name) 150 | 151 | def create_tree(root_id, node_p, depth): 152 | if depth == 0: 153 | return 154 | children = self.df[self.df['parent'] == root_id] 155 | for child in children.iterrows(): 156 | genre_id = child[0] 157 | node_c = create_node(genre_id) 158 | graph.add_edge(pydot.Edge(node_p, node_c)) 159 | create_tree(genre_id, node_c, 160 | depth-1 if depth is not None else None) 161 | 162 | for root in roots: 163 | node_p = create_node(root) 164 | graph.add_node(node_p) 165 | create_tree(root, node_p, depth) 166 | 167 | return graph 168 | 169 | def find_roots(self): 170 | roots = [] 171 | for gid, row in self.df.iterrows(): 172 | parent = row['parent'] 173 | title = row['title'] 174 | if parent == 0: 175 | roots.append(gid) 176 | elif parent not in self.df.index: 177 | msg = '{} ({}) has parent {} which is missing'.format( 178 | gid, title, parent) 179 | raise RuntimeError(msg) 180 | return roots 181 | 182 | 183 | def load(filepath): 184 | 185 | filename = os.path.basename(filepath) 186 | 187 | if 'features' in filename: 188 | return pd.read_csv(filepath, index_col=0, header=[0, 1, 2]) 189 | 190 | if 'echonest' in filename: 191 | return pd.read_csv(filepath, index_col=0, header=[0, 1, 2]) 192 | 193 | if 'genres' in filename: 194 | return pd.read_csv(filepath, index_col=0) 195 | 196 | if 'tracks' in filename: 197 | tracks = pd.read_csv(filepath, index_col=0, header=[0, 1]) 198 | 199 | COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'), 200 | ('track', 'genres'), ('track', 'genres_all')] 201 | for column in COLUMNS: 202 | tracks[column] = tracks[column].map(ast.literal_eval) 203 | 204 | COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'), 205 | ('album', 'date_created'), ('album', 'date_released'), 206 | ('artist', 'date_created'), ('artist', 'active_year_begin'), 207 | ('artist', 'active_year_end')] 208 | for column in COLUMNS: 209 | tracks[column] = pd.to_datetime(tracks[column]) 210 | 211 | SUBSETS = ('small', 'medium', 'large') 212 | try: 213 | tracks['set', 'subset'] = tracks['set', 'subset'].astype( 214 | 'category', categories=SUBSETS, ordered=True) 215 | 
except (ValueError, TypeError): 216 | # the categories and ordered arguments were removed in pandas 0.25 217 | tracks['set', 'subset'] = tracks['set', 'subset'].astype( 218 | pd.CategoricalDtype(categories=SUBSETS, ordered=True)) 219 | 220 | COLUMNS = [('track', 'genre_top'), ('track', 'license'), 221 | ('album', 'type'), ('album', 'information'), 222 | ('artist', 'bio')] 223 | for column in COLUMNS: 224 | tracks[column] = tracks[column].astype('category') 225 | 226 | return tracks 227 | 228 | 229 | def get_audio_path(audio_dir, track_id): 230 | """ 231 | Return the path to the mp3 given the directory where the audio is stored 232 | and the track ID. 233 | 234 | Examples 235 | -------- 236 | >>> import utils 237 | >>> AUDIO_DIR = os.environ.get('AUDIO_DIR') 238 | >>> utils.get_audio_path(AUDIO_DIR, 2) 239 | '../data/fma_small/000/000002.mp3' 240 | 241 | """ 242 | tid_str = '{:06d}'.format(track_id) 243 | return os.path.join(audio_dir, tid_str[:3], tid_str + '.mp3') 244 | 245 | 246 | class Loader: 247 | def load(self, filepath): 248 | raise NotImplementedError() 249 | 250 | 251 | class RawAudioLoader(Loader): 252 | def __init__(self, sampling_rate=SAMPLING_RATE): 253 | self.sampling_rate = sampling_rate 254 | self.shape = (NB_AUDIO_SAMPLES * sampling_rate // SAMPLING_RATE, ) 255 | 256 | def load(self, filepath): 257 | return self._load(filepath)[:self.shape[0]] 258 | 259 | 260 | class LibrosaLoader(RawAudioLoader): 261 | def _load(self, filepath): 262 | import librosa 263 | sr = self.sampling_rate if self.sampling_rate != SAMPLING_RATE else None 264 | # kaiser_fast is 3x faster than kaiser_best 265 | # x, sr = librosa.load(filepath, sr=sr, res_type='kaiser_fast') 266 | x, sr = librosa.load(filepath, sr=sr) 267 | return x 268 | 269 | 270 | class AudioreadLoader(RawAudioLoader): 271 | def _load(self, filepath): 272 | import audioread 273 | a = audioread.audio_open(filepath) 274 | a.read_data() 275 | 276 | 277 | class PydubLoader(RawAudioLoader): 278 | def _load(self, filepath): 279 | from pydub import AudioSegment 280 | song = AudioSegment.from_file(filepath) 281 | song = song.set_channels(1) 282 | x = song.get_array_of_samples() 283 | # print(filepath) if song.channels != 2 else None 284 | return np.array(x) 285 | 286 | 287 | class FfmpegLoader(RawAudioLoader): 288 | def _load(self, filepath): 289 | """Fastest and less CPU intensive loading method.""" 290 | import subprocess as sp 291 | command = ['ffmpeg', 292 | '-i', filepath, 293 | '-f', 's16le', 294 | '-acodec', 'pcm_s16le', 295 | '-ac', '1'] # channels: 2 for stereo, 1 for mono 296 | if self.sampling_rate != SAMPLING_RATE: 297 | command.extend(['-ar', str(self.sampling_rate)]) 298 | command.append('-') 299 | # 30s at 44.1 kHz ~= 1.3e6 300 | proc = sp.run(command, stdout=sp.PIPE, bufsize=10**7, stderr=sp.DEVNULL, check=True) 301 | 302 | return np.fromstring(proc.stdout, dtype="int16") 303 | 304 | 305 | def build_sample_loader(audio_dir, Y, loader): 306 | 307 | class SampleLoader: 308 | 309 | def __init__(self, tids, batch_size=4): 310 | self.lock1 = multiprocessing.Lock() 311 | self.lock2 = multiprocessing.Lock() 312 | self.batch_foremost = sharedctypes.RawValue(ctypes.c_int, 0) 313 | self.batch_rearmost = sharedctypes.RawValue(ctypes.c_int, -1) 314 | self.condition = multiprocessing.Condition(lock=self.lock2) 315 | 316 | data = sharedctypes.RawArray(ctypes.c_int, tids.data) 317 | self.tids = np.ctypeslib.as_array(data) 318 | 319 | self.batch_size = batch_size 320 | self.loader = loader 321 | self.X = np.empty((self.batch_size, 
*loader.shape)) 322 | self.Y = np.empty((self.batch_size, Y.shape[1]), dtype=np.int) 323 | 324 | def __iter__(self): 325 | return self 326 | 327 | def __next__(self): 328 | 329 | with self.lock1: 330 | if self.batch_foremost.value == 0: 331 | np.random.shuffle(self.tids) 332 | 333 | batch_current = self.batch_foremost.value 334 | if self.batch_foremost.value + self.batch_size < self.tids.size: 335 | batch_size = self.batch_size 336 | self.batch_foremost.value += self.batch_size 337 | else: 338 | batch_size = self.tids.size - self.batch_foremost.value 339 | self.batch_foremost.value = 0 340 | 341 | # print(self.tids, self.batch_foremost.value, batch_current, self.tids[batch_current], batch_size) 342 | # print('queue', self.tids[batch_current], batch_size) 343 | tids = np.array(self.tids[batch_current:batch_current+batch_size]) 344 | 345 | batch_size = 0 346 | for tid in tids: 347 | try: 348 | audio_path = get_audio_path(audio_dir, tid) 349 | self.X[batch_size] = self.loader.load(audio_path) 350 | self.Y[batch_size] = Y.loc[tid] 351 | batch_size += 1 352 | except Exception as e: 353 | print("\nIgnoring " + audio_path +" (error: " + str(e) +").") 354 | 355 | with self.lock2: 356 | while (batch_current - self.batch_rearmost.value) % self.tids.size > self.batch_size: 357 | # print('wait', indices[0], batch_current, self.batch_rearmost.value) 358 | self.condition.wait() 359 | self.condition.notify_all() 360 | # print('yield', indices[0], batch_current, self.batch_rearmost.value) 361 | self.batch_rearmost.value = batch_current 362 | 363 | return self.X[:batch_size], self.Y[:batch_size] 364 | 365 | return SampleLoader 366 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FMA: A Dataset For Music Analysis 2 | 3 | [Michaël Defferrard](https://deff.ch), 4 | [Kirell Benzi](https://kirellbenzi.com), 5 | [Pierre Vandergheynst](https://people.epfl.ch/pierre.vandergheynst), 6 | [Xavier Bresson](https://www.ntu.edu.sg/home/xbresson). \ 7 | International Society for Music Information Retrieval Conference (ISMIR), 2017. 8 | 9 | > We introduce the Free Music Archive (FMA), an open and easily accessible 10 | > dataset suitable for evaluating several tasks in MIR, a field concerned with 11 | > browsing, searching, and organizing large music collections. The community's 12 | > growing interest in feature and end-to-end learning is however restrained by 13 | > the limited availability of large audio datasets. The FMA aims to overcome 14 | > this hurdle by providing 917 GiB and 343 days of Creative Commons-licensed 15 | > audio from 106,574 tracks from 16,341 artists and 14,854 albums, arranged in 16 | > a hierarchical taxonomy of 161 genres. It provides full-length and 17 | > high-quality audio, pre-computed features, together with track- and 18 | > user-level metadata, tags, and free-form text such as biographies. We here 19 | > describe the dataset and how it was created, propose a train/validation/test 20 | > split and three subsets, discuss some suitable MIR tasks, and evaluate some 21 | > baselines for genre recognition. Code, data, and usage examples are available 22 | > at . 
23 | 24 | * Paper: [`arXiv:1612.01840`][paper] ([latex and reviews](https://github.com/mdeff/paper-fma-ismir2017)) 25 | * Slides: [`doi:10.5281/zenodo.1066119`](https://doi.org/10.5281/zenodo.1066119) 26 | * Poster: [`doi:10.5281/zenodo.1035847`](https://doi.org/10.5281/zenodo.1035847) 27 | 28 | [paper]: https://arxiv.org/abs/1612.01840 29 | [FMA]: https://freemusicarchive.org 30 | 31 | ## Data 32 | 33 | All metadata and features for all tracks are distributed in **[`fma_metadata.zip`]** (342 MiB). 34 | The tables below can be used with [pandas] or any other data analysis tool. 35 | See the [paper] or the [`usage.ipynb`] notebook for a description. 36 | * `tracks.csv`: per-track metadata such as ID, title, artist, genres, tags and play counts, for all 106,574 tracks. 37 | * `genres.csv`: all 163 genres with name and parent (used to infer the genre hierarchy and top-level genres). 38 | * `features.csv`: common features extracted with [librosa]. 39 | * `echonest.csv`: audio features provided by [Echonest] (now [Spotify]) for a subset of 13,129 tracks. 40 | 41 | [pandas]: https://pandas.pydata.org/ 42 | [librosa]: https://librosa.org/ 43 | [spotify]: https://www.spotify.com/ 44 | [echonest]: https://web.archive.org/web/20170519050040/http://the.echonest.com/ 45 | 46 | Then, there are various sizes of MP3-encoded audio data: 47 | 48 | 1. **[`fma_small.zip`]**: 8,000 tracks of 30s, 8 balanced genres (GTZAN-like) (7.2 GiB) 49 | 2. **[`fma_medium.zip`]**: 25,000 tracks of 30s, 16 unbalanced genres (22 GiB) 50 | 3. **[`fma_large.zip`]**: 106,574 tracks of 30s, 161 unbalanced genres (93 GiB) 51 | 4. **[`fma_full.zip`]**: 106,574 untrimmed tracks, 161 unbalanced genres (879 GiB) 52 | 53 | [`fma_metadata.zip`]: https://os.unil.cloud.switch.ch/fma/fma_metadata.zip 54 | [`fma_small.zip`]: https://os.unil.cloud.switch.ch/fma/fma_small.zip 55 | [`fma_medium.zip`]: https://os.unil.cloud.switch.ch/fma/fma_medium.zip 56 | [`fma_large.zip`]: https://os.unil.cloud.switch.ch/fma/fma_large.zip 57 | [`fma_full.zip`]: https://os.unil.cloud.switch.ch/fma/fma_full.zip 58 | 59 | See the [wiki](https://github.com/mdeff/fma/wiki) (or [#41](https://github.com/mdeff/fma/issues/41)) for **known issues (errata)**. 60 | 61 | ## Code 62 | 63 | The following notebooks, scripts, and modules have been developed for the dataset. 64 | 65 | 1. [`usage.ipynb`]: shows how to load the datasets and develop, train, and test your own models with them. 66 | 2. [`analysis.ipynb`]: exploration of the metadata, data, and features. 67 | Creates the [figures](https://github.com/mdeff/fma/tree/outputs/figures) used in the paper. 68 | 3. [`baselines.ipynb`]: baseline models for genre recognition, both from audio and features. 69 | 4. [`features.py`]: feature extraction from the audio (used to create `features.csv`). 70 | 5. [`webapi.ipynb`]: query the web API of the [FMA]. Can be used to update the dataset. 71 | 6. [`creation.ipynb`]: creation of the dataset (used to create `tracks.csv` and `genres.csv`). 72 | 7. [`creation.py`]: creation of the dataset (long-running data collection and processing). 73 | 8. [`utils.py`]: helper functions and classes (see the example below).
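As a quick orientation, here is a minimal sketch of how the metadata can be loaded and sliced with [`utils.py`], assuming `fma_metadata.zip` was extracted into `data/` as described in the Usage section below; see [`usage.ipynb`] for the full walk-through.

```python
import utils

# Load the metadata tables (pandas DataFrames with hierarchical columns).
tracks = utils.load('data/fma_metadata/tracks.csv')
genres = utils.load('data/fma_metadata/genres.csv')
features = utils.load('data/fma_metadata/features.csv')

# Select the 8,000 tracks of the small subset and inspect their top-level genres.
small = tracks[tracks['set', 'subset'] <= 'small']
print(small.shape, small['track', 'genre_top'].value_counts().head())

# Pre-computed audio features (e.g. MFCCs) of those tracks, ready for a classifier.
X = features.loc[small.index, 'mfcc']
```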
74 | 75 | [`usage.ipynb`]: https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/usage.ipynb 76 | [`analysis.ipynb`]: https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/analysis.ipynb 77 | [`baselines.ipynb`]: https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/baselines.ipynb 78 | [`features.py`]: features.py 79 | [`webapi.ipynb`]: https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/webapi.ipynb 80 | [`creation.ipynb`]: https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/creation.ipynb 81 | [`creation.py`]: creation.py 82 | [`utils.py`]: utils.py 83 | 84 | ## Usage 85 | 86 | [![Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/mdeff/fma/outputs?urlpath=lab/tree/usage.ipynb) 87 |   Click the binder badge to play with the code and data from your browser without installing anything. 88 | 89 | 1. Clone the repository. 90 | ```sh 91 | git clone https://github.com/mdeff/fma.git 92 | cd fma 93 | ``` 94 | 95 | 1.
Create a Python 3.6 environment. 96 | 97 | ```sh 98 | # with https://conda.io 99 | conda create -n fma python=3.6 100 | conda activate fma 101 | 102 | # with https://github.com/pyenv/pyenv 103 | pyenv install 3.6.0 104 | pyenv virtualenv 3.6.0 fma 105 | pyenv activate fma 106 | 107 | # with https://pipenv.pypa.io 108 | pipenv --python 3.6 109 | pipenv shell 110 | 111 | # with https://docs.python.org/3/tutorial/venv.html 112 | python3.6 -m venv ./env 113 | source ./env/bin/activate 114 | ``` 115 |
116 | 117 | 1. Install dependencies. 118 | ```sh 119 | pip install --upgrade pip setuptools wheel 120 | pip install numpy==1.12.1 # workaround resampy's bogus setup.py 121 | pip install -r requirements.txt 122 | ``` 123 | Note: you may need to install [ffmpeg](https://ffmpeg.org/download.html) or [graphviz](https://www.graphviz.org) depending on your usage.\ 124 | Note: install [CUDA](https://en.wikipedia.org/wiki/CUDA) to train neural networks on GPUs (see [Tensorflow's instructions](https://www.tensorflow.org/install/)). 125 | 126 | 1. Download some data, verify its integrity, and uncompress the archives. 127 | ```sh 128 | cd data 129 | 130 | curl -O https://os.unil.cloud.switch.ch/fma/fma_metadata.zip 131 | curl -O https://os.unil.cloud.switch.ch/fma/fma_small.zip 132 | curl -O https://os.unil.cloud.switch.ch/fma/fma_medium.zip 133 | curl -O https://os.unil.cloud.switch.ch/fma/fma_large.zip 134 | curl -O https://os.unil.cloud.switch.ch/fma/fma_full.zip 135 | 136 | echo "f0df49ffe5f2a6008d7dc83c6915b31835dfe733 fma_metadata.zip" | sha1sum -c - 137 | echo "ade154f733639d52e35e32f5593efe5be76c6d70 fma_small.zip" | sha1sum -c - 138 | echo "c67b69ea232021025fca9231fc1c7c1a063ab50b fma_medium.zip" | sha1sum -c - 139 | echo "497109f4dd721066b5ce5e5f250ec604dc78939e fma_large.zip" | sha1sum -c - 140 | echo "0f0ace23fbe9ba30ecb7e95f763e435ea802b8ab fma_full.zip" | sha1sum -c - 141 | 142 | unzip fma_metadata.zip 143 | unzip fma_small.zip 144 | unzip fma_medium.zip 145 | unzip fma_large.zip 146 | unzip fma_full.zip 147 | 148 | cd .. 149 | ``` 150 | 151 | Note: try [7zip](https://www.7-zip.org) if you hit decompression errors. 152 | It might be an [unsupported compression issue](https://github.com/mdeff/fma/issues/5). 153 | 154 | 1. Fill a `.env` configuration file (at the repository's root) with the following content. 155 | ``` 156 | AUDIO_DIR=./data/fma_small/ # the path to a decompressed fma_*.zip 157 | FMA_KEY=MYKEY # only if you want to query the freemusicarchive.org API 158 | ``` 159 | 160 | 1. Open Jupyter or run a notebook. 161 | ```sh 162 | jupyter notebook 163 | make usage.ipynb 164 | ``` 165 | 166 | ## Impact, coverage, and resources 167 | 168 |
100+ research papers 169 | 170 | Full list on [Google Scholar](https://scholar.google.com/scholar?cites=13646959466952873682,13785796238335741238,7544459641098681164,5736399534855095976). 171 | Some picks below. 172 | 173 | * [Zero-shot Learning for Audio-based Music Classification and Tagging](https://arxiv.org/abs/1907.02670) 174 | * [One deep music representation to rule them all? A comparative analysis of different representation learning strategies](https://doi.org/10.1007/s00521-019-04076-1) 175 | * [Deep Learning for Audio-Based Music Classification and Tagging: Teaching Computers to Distinguish Rock from Bach](https://sci-hub.tw/10.1109/MSP.2018.2874383) 176 | * [Learning Discrete Structures for Graph Neural Networks](https://arxiv.org/abs/1903.11960) 177 | * [A context encoder for audio inpainting](https://arxiv.org/abs/1810.12138) 178 | * [OpenMIC-2018: An Open Data-set for Multiple Instrument Recognition](https://archives.ismir.net/ismir2018/paper/000248.pdf) 179 | * [Detecting Music Genre Using Extreme Gradient Boosting](https://doi.org/10.1145/3184558.3191822) 180 | * [Transfer Learning of Artist Group Factors to Musical Genre Classification](https://doi.org/10.1145/3184558.3191823) 181 | * [Learning to Recognize Musical Genre from Audio: Challenge Overview](https://arxiv.org/abs/1803.05337) 182 | * [Representation Learning of Music Using Artist Labels](https://arxiv.org/abs/1710.06648) 183 | 184 |
185 | 186 |
2 derived works 187 | 188 | * [OpenMIC-2018: An Open Data-set for Multiple Instrument Recognition](https://github.com/cosmir/openmic-2018) 189 | * [ConvNet features](https://github.com/keunwoochoi/FMA_convnet_features) from [Transfer learning for music classification and regression tasks](https://arxiv.org/abs/1703.09179) 190 | 191 |
192 | 193 |
~10 posts 194 | 195 | * [Music Genre Classification With TensorFlow](https://towardsdatascience.com/music-genre-classification-with-tensorflow-3de38f0d4dbb), Towards Data Science, 2020-08-11. 196 | * [Music Genre Classification: Transformers vs Recurrent Neural Networks](https://towardsdatascience.com/music-genre-classification-transformers-vs-recurrent-neural-networks-631751a71c58), Towards Data Science, 2020-06-14. 197 | * [Using CNNs and RNNs for Music Genre Recognition](https://towardsdatascience.com/using-cnns-and-rnns-for-music-genre-recognition-2435fb2ed6af), Towards Data Science, 2018-12-13. 198 | * [Over 1.5 TB’s of Labeled Audio Datasets](https://towardsdatascience.com/a-data-lakes-worth-of-audio-datasets-b45b88cd4ad), Towards Data Science, 2018-11-13. 199 | * [Discovering Descriptive Music Genres Using K-Means Clustering](https://medium.com/latinxinai/discovering-descriptive-music-genres-using-k-means-clustering-d19bdea5e443), Medium, 2018-04-09. 200 | * [25 Open Datasets for Deep Learning Every Data Scientist Must Work With](https://www.analyticsvidhya.com/blog/2018/03/comprehensive-collection-deep-learning-datasets/), Analytics Vidhya, 2018-03-29. 201 | * [Learning Music Genres](https://medium.com/@diegoagher/learning-music-genres-5ab1cabadfed), Medium, 2017-12-13. 202 | * [music2vec: Generating Vector Embeddings for Genre-Classification Task](https://medium.com/@rajatheb/music2vec-generating-vector-embedding-for-genre-classification-task-411187a20820), Medium, 2017-11-28. 203 | * [A Music Information Retrieval Dataset, Made With FMA](https://web.archive.org/web/20190907182116/http://freemusicarchive.org/member/cheyenne_h/blog/A_Music_Information_Retrieval_Dataset_Made_With_FMA), freemusicarchive.org, 2017-05-22. 204 | * [Pre-publication release announced](https://twitter.com/m_deff/status/861985446116589569), twitter.com, 2017-05-09. 205 | * [FMA: A Dataset For Music Analysis](https://tensorflow.blog/2017/03/14/fma-a-dataset-for-music-analysis), tensorflow.blog, 2017-03-14. 206 | * [Beta release discussed](https://twitter.com/YadFaeq/status/829406463286063104), twitter.com, 2017-02-08. 207 | * [FMA Data Set for Researchers Released](https://web.archive.org/web/20190826112752/http://freemusicarchive.org/member/cheyenne_h/blog/FMA_Dataset_for_Researchers), freemusicarchive.org, 2016-12-15. 208 | 209 |
210 | 211 |
5 events 212 | 213 | * [Summer Workshop](https://hcdigitalscholarship.github.io/audio-files) by the [Haverford Digital Scholarship Library](https://www.haverford.edu/library/digital-scholarship), 2020-07. 214 | * [Genre recognition challenge](https://www.crowdai.org/challenges/www-2018-challenge-learning-to-recognize-musical-genre) at the [Web Conference](https://www2018.thewebconf.org/program/challenges-track/), Lyon, 2018-04. 215 | * [Slides](https://doi.org/10.5281/zenodo.1066119) presented at the [Data Jam days](http://datajamdays.org), Lausanne, 2017-11-24. 216 | * [Poster](https://doi.org/10.5281/zenodo.1035847) presented at [ISMIR 2017](https://ismir2017.ismir.net), Suzhou, 2017-10-24. 217 | * [Slides](https://doi.org/10.5281/zenodo.999353) for the [Open Science in Practice](https://osip2017.epfl.ch) summer school at EPFL, 2017-09-29. 218 | 219 |
220 | 221 |
~10 dataset lists 222 | 223 | * 224 | * 225 | * 226 | * 227 | * 228 | * 229 | * 230 | * 231 | * 232 | * 233 | * 234 | 235 |
236 | 237 | ## Contributing 238 | 239 | Contribute by opening an [issue](https://github.com/mdeff/fma/issues) or a [pull request](https://github.com/mdeff/fma/pulls). 240 | Let this repository be a hub around the dataset! 241 | 242 | ## History 243 | 244 | **2017-05-09 pre-publication release** 245 | * paper: [arXiv:1612.01840v2](https://arxiv.org/abs/1612.01840v2) 246 | * code: [git tag rc1](https://github.com/mdeff/fma/releases/tag/rc1) 247 | * `fma_metadata.zip` sha1: `f0df49ffe5f2a6008d7dc83c6915b31835dfe733` 248 | * `fma_small.zip` sha1: `ade154f733639d52e35e32f5593efe5be76c6d70` 249 | * `fma_medium.zip` sha1: `c67b69ea232021025fca9231fc1c7c1a063ab50b` 250 | * `fma_large.zip` sha1: `497109f4dd721066b5ce5e5f250ec604dc78939e` 251 | * `fma_full.zip` sha1: `0f0ace23fbe9ba30ecb7e95f763e435ea802b8ab` 252 | * known issues: see [#41](https://github.com/mdeff/fma/issues/41) 253 | 254 | **2016-12-06 beta release** 255 | * paper: [arXiv:1612.01840v1](https://arxiv.org/abs/1612.01840v1) 256 | * code: [git tag beta](https://github.com/mdeff/fma/releases/tag/beta) 257 | * `fma_small.zip` sha1: `e731a5d56a5625f7b7f770923ee32922374e2cbf` 258 | * `fma_medium.zip` sha1: `fe23d6f2a400821ed1271ded6bcd530b7a8ea551` 259 | 260 | ## Acknowledgments and Licenses 261 | 262 | We are grateful to the [Swiss Data Science Center] ([EPFL] and [ETHZ]) for hosting the dataset. 263 | 264 | Please cite our work if you use our code or data. 265 | 266 | ``` 267 | @inproceedings{fma_dataset, 268 | title = {{FMA}: A Dataset for Music Analysis}, 269 | author = {Defferrard, Micha\"el and Benzi, Kirell and Vandergheynst, Pierre and Bresson, Xavier}, 270 | booktitle = {18th International Society for Music Information Retrieval Conference (ISMIR)}, 271 | year = {2017}, 272 | archiveprefix = {arXiv}, 273 | eprint = {1612.01840}, 274 | url = {https://arxiv.org/abs/1612.01840}, 275 | } 276 | ``` 277 | 278 | ``` 279 | @inproceedings{fma_challenge, 280 | title = {Learning to Recognize Musical Genre from Audio}, 281 | subtitle = {Challenge Overview}, 282 | author = {Defferrard, Micha\"el and Mohanty, Sharada P. and Carroll, Sean F. and Salath\'e, Marcel}, 283 | booktitle = {The 2018 Web Conference Companion}, 284 | year = {2018}, 285 | publisher = {ACM Press}, 286 | isbn = {9781450356404}, 287 | doi = {10.1145/3184558.3192310}, 288 | archiveprefix = {arXiv}, 289 | eprint = {1803.05337}, 290 | url = {https://arxiv.org/abs/1803.05337}, 291 | } 292 | ``` 293 | 294 | * The code in this repository is released under the [MIT license](LICENSE.txt). 295 | * The metadata is released under the [Creative Commons Attribution 4.0 International License (CC BY 4.0)][ccby40]. 296 | * We do not hold the copyright on the audio and distribute it under the license chosen by the artist. 297 | * The dataset is meant for research purposes. 
298 | 299 | [ccby40]: https://creativecommons.org/licenses/by/4.0 300 | [Swiss Data Science Center]: https://datascience.ch/collaboration-and-partnerships 301 | [EPFL]: https://www.epfl.ch 302 | [ETHZ]: https://www.ethz.ch 303 | -------------------------------------------------------------------------------- /baselines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n", 8 | "\n", 9 | "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n", 10 | "\n", 11 | "## Baselines\n", 12 | "\n", 13 | "* This notebook evaluates standard classifiers from scikit-learn on the provided features.\n", 14 | "* Moreover, it evaluates Deep Learning models on both audio and spectrograms." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import time\n", 24 | "import os\n", 25 | "\n", 26 | "import IPython.display as ipd\n", 27 | "from tqdm import tqdm_notebook\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "import keras\n", 31 | "from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape\n", 32 | "\n", 33 | "from sklearn.utils import shuffle\n", 34 | "from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler\n", 35 | "from sklearn.linear_model import LogisticRegression\n", 36 | "from sklearn.neighbors import KNeighborsClassifier\n", 37 | "from sklearn.svm import SVC, LinearSVC\n", 38 | "#from sklearn.gaussian_process import GaussianProcessClassifier\n", 39 | "#from sklearn.gaussian_process.kernels import RBF\n", 40 | "from sklearn.tree import DecisionTreeClassifier\n", 41 | "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", 42 | "from sklearn.neural_network import MLPClassifier\n", 43 | "from sklearn.naive_bayes import GaussianNB\n", 44 | "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n", 45 | "from sklearn.multiclass import OneVsRestClassifier\n", 46 | "\n", 47 | "import utils" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "AUDIO_DIR = os.environ.get('AUDIO_DIR')\n", 57 | "\n", 58 | "tracks = utils.load('data/fma_metadata/tracks.csv')\n", 59 | "features = utils.load('data/fma_metadata/features.csv')\n", 60 | "echonest = utils.load('data/fma_metadata/echonest.csv')\n", 61 | "\n", 62 | "np.testing.assert_array_equal(features.index, tracks.index)\n", 63 | "assert echonest.index.isin(tracks.index).all()\n", 64 | "\n", 65 | "tracks.shape, features.shape, echonest.shape" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## Subset" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "subset = tracks.index[tracks['set', 'subset'] <= 'medium']\n", 82 | "\n", 83 | "assert subset.isin(tracks.index).all()\n", 84 | "assert subset.isin(features.index).all()\n", 85 | "\n", 86 | "features_all = features.join(echonest, how='inner').sort_index(axis=1)\n", 87 | "print('Not enough Echonest features: {}'.format(features_all.shape))\n", 88 | "\n", 89 | "tracks = tracks.loc[subset]\n", 90 | "features_all = features.loc[subset]\n", 91 | "\n", 
92 | "tracks.shape, features_all.shape" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "train = tracks.index[tracks['set', 'split'] == 'training']\n", 102 | "val = tracks.index[tracks['set', 'split'] == 'validation']\n", 103 | "test = tracks.index[tracks['set', 'split'] == 'test']\n", 104 | "\n", 105 | "print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))\n", 106 | "\n", 107 | "genres = list(LabelEncoder().fit(tracks['track', 'genre_top']).classes_)\n", 108 | "#genres = list(tracks['track', 'genre_top'].unique())\n", 109 | "print('Top genres ({}): {}'.format(len(genres), genres))\n", 110 | "genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)\n", 111 | "print('All genres ({}): {}'.format(len(genres), genres))" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "## 1 Multiple classifiers and feature sets\n", 119 | "\n", 120 | "Todo:\n", 121 | "* Cross-validation for hyper-parameters.\n", 122 | "* Dimensionality reduction?" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "### 1.1 Pre-processing" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "def pre_process(tracks, features, columns, multi_label=False, verbose=False):\n", 139 | " if not multi_label:\n", 140 | " # Assign an integer value to each genre.\n", 141 | " enc = LabelEncoder()\n", 142 | " labels = tracks['track', 'genre_top']\n", 143 | " #y = enc.fit_transform(tracks['track', 'genre_top'])\n", 144 | " else:\n", 145 | " # Create an indicator matrix.\n", 146 | " enc = MultiLabelBinarizer()\n", 147 | " labels = tracks['track', 'genres_all']\n", 148 | " #labels = tracks['track', 'genres']\n", 149 | "\n", 150 | " # Split in training, validation and testing sets.\n", 151 | " y_train = enc.fit_transform(labels[train])\n", 152 | " y_val = enc.transform(labels[val])\n", 153 | " y_test = enc.transform(labels[test])\n", 154 | " X_train = features.loc[train, columns].as_matrix()\n", 155 | " X_val = features.loc[val, columns].as_matrix()\n", 156 | " X_test = features.loc[test, columns].as_matrix()\n", 157 | " \n", 158 | " X_train, y_train = shuffle(X_train, y_train, random_state=42)\n", 159 | " \n", 160 | " # Standardize features by removing the mean and scaling to unit variance.\n", 161 | " scaler = StandardScaler(copy=False)\n", 162 | " scaler.fit_transform(X_train)\n", 163 | " scaler.transform(X_val)\n", 164 | " scaler.transform(X_test)\n", 165 | " \n", 166 | " return y_train, y_val, y_test, X_train, X_val, X_test" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### 1.2 Single genre" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "def test_classifiers_features(classifiers, feature_sets, multi_label=False):\n", 183 | " columns = list(classifiers.keys()).insert(0, 'dim')\n", 184 | " scores = pd.DataFrame(columns=columns, index=feature_sets.keys())\n", 185 | " times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())\n", 186 | " for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):\n", 187 | " y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, 
features_all, fset, multi_label)\n", 188 | " scores.loc[fset_name, 'dim'] = X_train.shape[1]\n", 189 | " for clf_name, clf in classifiers.items(): # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):\n", 190 | " t = time.process_time()\n", 191 | " clf.fit(X_train, y_train)\n", 192 | " score = clf.score(X_test, y_test)\n", 193 | " scores.loc[fset_name, clf_name] = score\n", 194 | " times.loc[fset_name, clf_name] = time.process_time() - t\n", 195 | " return scores, times\n", 196 | "\n", 197 | "def format_scores(scores):\n", 198 | " def highlight(s):\n", 199 | " is_max = s == max(s[1:])\n", 200 | " return ['background-color: yellow' if v else '' for v in is_max]\n", 201 | " scores = scores.style.apply(highlight, axis=1)\n", 202 | " return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "classifiers = {\n", 212 | " 'LR': LogisticRegression(),\n", 213 | " 'kNN': KNeighborsClassifier(n_neighbors=200),\n", 214 | " 'SVCrbf': SVC(kernel='rbf'),\n", 215 | " 'SVCpoly1': SVC(kernel='poly', degree=1),\n", 216 | " 'linSVC1': SVC(kernel=\"linear\"),\n", 217 | " 'linSVC2': LinearSVC(),\n", 218 | " #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),\n", 219 | " 'DT': DecisionTreeClassifier(max_depth=5),\n", 220 | " 'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n", 221 | " 'AdaBoost': AdaBoostClassifier(n_estimators=10),\n", 222 | " 'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),\n", 223 | " 'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),\n", 224 | " 'NB': GaussianNB(),\n", 225 | " 'QDA': QuadraticDiscriminantAnalysis(),\n", 226 | "}\n", 227 | "\n", 228 | "feature_sets = {\n", 229 | "# 'echonest_audio': ('echonest', 'audio_features'),\n", 230 | "# 'echonest_social': ('echonest', 'social_features'),\n", 231 | "# 'echonest_temporal': ('echonest', 'temporal_features'),\n", 232 | "# 'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),\n", 233 | "# 'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),\n", 234 | "}\n", 235 | "for name in features.columns.levels[0]:\n", 236 | " feature_sets[name] = name\n", 237 | "feature_sets.update({\n", 238 | " 'mfcc/contrast': ['mfcc', 'spectral_contrast'],\n", 239 | " 'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],\n", 240 | " 'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],\n", 241 | " 'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],\n", 242 | " 'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],\n", 243 | " 'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],\n", 244 | " 'all_non-echonest': list(features.columns.levels[0])\n", 245 | "})\n", 246 | "\n", 247 | "scores, times = test_classifiers_features(classifiers, feature_sets)\n", 248 | "\n", 249 | "ipd.display(format_scores(scores))\n", 250 | "ipd.display(times.style.format('{:.4f}'))" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "### 1.3 Multiple genres\n", 258 | "\n", 259 | "Todo:\n", 260 | "* Ignore rare genres? Count them higher up in the genre tree? On the other hand it's not much tracks." 
261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "classifiers = {\n", 270 | " #LogisticRegression(),\n", 271 | " 'LR': OneVsRestClassifier(LogisticRegression()),\n", 272 | " 'SVC': OneVsRestClassifier(SVC()),\n", 273 | " 'MLP': MLPClassifier(max_iter=700),\n", 274 | "}\n", 275 | "\n", 276 | "feature_sets = {\n", 277 | "# 'echonest_audio': ('echonest', 'audio_features'),\n", 278 | "# 'echonest_temporal': ('echonest', 'temporal_features'),\n", 279 | " 'mfcc': 'mfcc',\n", 280 | " 'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],\n", 281 | " 'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],\n", 282 | "}\n", 283 | "\n", 284 | "scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)\n", 285 | "\n", 286 | "ipd.display(format_scores(scores))\n", 287 | "ipd.display(times.style.format('{:.4f}'))" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "## 2 Deep learning on raw audio\n", 295 | "\n", 296 | "Other architectures:\n", 297 | "* [Learning Features of Music from Scratch (MusicNet)](https://arxiv.org/abs/1611.09827), John Thickstun, Zaid Harchaoui, Sham Kakade." 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])\n", 307 | "labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "Load audio samples in parallel using `multiprocessing` so as to maximize CPU usage when decoding MP3s and making some optional pre-processing. There are multiple ways to load a waveform from a compressed MP3:\n", 315 | "* librosa uses audioread in the backend which can use many native libraries, e.g. ffmpeg\n", 316 | " * resampling is very slow --> use `kaiser_fast`\n", 317 | " * does not work with multi-processing, for keras `fit_generator()`\n", 318 | "* pydub is a high-level interface for audio modification, uses ffmpeg to load\n", 319 | " * store a temporary `.wav`\n", 320 | "* directly pipe ffmpeg output\n", 321 | " * fastest method\n", 322 | "* [pyAV](https://github.com/mikeboers/PyAV) may be a fastest alternative by linking to ffmpeg libraries" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "# Just be sure that everything is fine. 
Multiprocessing is tricky to debug.\n", 332 | "utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))\n", 333 | "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())\n", 334 | "SampleLoader(train, batch_size=2).__next__()[0].shape" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Keras parameters.\n", 344 | "NB_WORKER = len(os.sched_getaffinity(0)) # number of usables CPUs\n", 345 | "params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "### 2.1 Fully connected neural network\n", 353 | "\n", 354 | "* Two layers with 10 hiddens is no better than random, ~11%.\n", 355 | "\n", 356 | "Optimize data loading to be CPU / GPU bound, not IO bound. Larger batches means reduced training time, so increase batch time until memory exhaustion. Number of workers and queue size have no influence on speed." 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "loader = utils.FfmpegLoader(sampling_rate=2000)\n", 366 | "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)\n", 367 | "print('Dimensionality: {}'.format(loader.shape))\n", 368 | "\n", 369 | "keras.backend.clear_session()\n", 370 | "\n", 371 | "model = keras.models.Sequential()\n", 372 | "model.add(Dense(output_dim=1000, input_shape=loader.shape))\n", 373 | "model.add(Activation(\"relu\"))\n", 374 | "model.add(Dense(output_dim=100))\n", 375 | "model.add(Activation(\"relu\"))\n", 376 | "model.add(Dense(output_dim=labels_onehot.shape[1]))\n", 377 | "model.add(Activation(\"softmax\"))\n", 378 | "\n", 379 | "optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)\n", 380 | "model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])\n", 381 | "\n", 382 | "model.fit_generator(SampleLoader(train, batch_size=64), train.size, nb_epoch=2, **params)\n", 383 | "loss = model.evaluate_generator(SampleLoader(val, batch_size=64), val.size, **params)\n", 384 | "loss = model.evaluate_generator(SampleLoader(test, batch_size=64), test.size, **params)\n", 385 | "#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params);\n", 386 | "\n", 387 | "loss" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "### 2.2 Convolutional neural network\n", 395 | "\n", 396 | "* Architecture: [End-to-end learning for music audio](http://www.mirlab.org/conference_papers/International_Conference/ICASSP%202014/papers/p7014-dieleman.pdf), Sander Dieleman, Benjamin Schrauwen.\n", 397 | "* Missing: track segmentation and class averaging (majority voting)\n", 398 | "* Compared with log-scaled mel-spectrograms instead of strided convolution as first layer.\n", 399 | "* Larger net: http://benanne.github.io/2014/08/05/spotify-cnns.html" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "loader = utils.FfmpegLoader(sampling_rate=16000)\n", 409 | "#loader = utils.LibrosaLoader(sampling_rate=16000)\n", 410 | "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)\n", 411 | "\n", 412 | "keras.backend.clear_session()\n", 413 | "\n", 414 | "model = keras.models.Sequential()\n", 415 | 
"model.add(Reshape((-1, 1), input_shape=loader.shape))\n", 416 | "print(model.output_shape)\n", 417 | "\n", 418 | "model.add(Conv1D(128, 512, subsample_length=512))\n", 419 | "print(model.output_shape)\n", 420 | "model.add(Activation(\"relu\"))\n", 421 | "\n", 422 | "model.add(Conv1D(32, 8))\n", 423 | "print(model.output_shape)\n", 424 | "model.add(Activation(\"relu\"))\n", 425 | "model.add(MaxPooling1D(4))\n", 426 | "\n", 427 | "model.add(Conv1D(32, 8))\n", 428 | "print(model.output_shape)\n", 429 | "model.add(Activation(\"relu\"))\n", 430 | "model.add(MaxPooling1D(4))\n", 431 | "\n", 432 | "print(model.output_shape)\n", 433 | "#model.add(Dropout(0.25))\n", 434 | "model.add(Flatten())\n", 435 | "print(model.output_shape)\n", 436 | "model.add(Dense(100))\n", 437 | "model.add(Activation(\"relu\"))\n", 438 | "print(model.output_shape)\n", 439 | "model.add(Dense(labels_onehot.shape[1]))\n", 440 | "model.add(Activation(\"softmax\"))\n", 441 | "print(model.output_shape)\n", 442 | "\n", 443 | "optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)\n", 444 | "#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)\n", 445 | "model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])\n", 446 | "\n", 447 | "model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=20, **params)\n", 448 | "loss = model.evaluate_generator(SampleLoader(val, batch_size=10), val.size, **params)\n", 449 | "loss = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)\n", 450 | "\n", 451 | "loss" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "### 2.3 Recurrent neural network" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "## 3 Deep learning on extracted audio features\n", 466 | "\n", 467 | "Look at:\n", 468 | "* Pre-processing in Keras: https://github.com/keunwoochoi/kapre\n", 469 | "* Convolutional Recurrent Neural Networks for Music Classification: https://github.com/keunwoochoi/icassp_2017\n", 470 | "* Music Auto-Tagger: https://github.com/keunwoochoi/music-auto_tagging-keras\n", 471 | "* Pre-processor: https://github.com/bmcfee/pumpp" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "### 3.1 ConvNet on MFCC\n", 479 | "\n", 480 | "* Architecture: [Automatic Musical Pattern Feature Extraction Using Convolutional Neural Network](http://www.iaeng.org/publication/IMECS2010/IMECS2010_pp546-550.pdf), Tom LH. Li, Antoni B. Chan and Andy HW. 
Chun\n", 481 | "* Missing: track segmentation and majority voting.\n", 482 | "* Best seen: 17.6%" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "class MfccLoader(utils.Loader):\n", 492 | " raw_loader = utils.FfmpegLoader(sampling_rate=22050)\n", 493 | " #shape = (13, 190) # For segmented tracks.\n", 494 | " shape = (13, 2582)\n", 495 | " def load(self, filename):\n", 496 | " import librosa\n", 497 | " x = self.raw_loader.load(filename)\n", 498 | " # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.\n", 499 | " mfcc = librosa.feature.mfcc(x, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)\n", 500 | " return mfcc\n", 501 | "\n", 502 | "loader = MfccLoader()\n", 503 | "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)\n", 504 | "loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": null, 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "keras.backend.clear_session()\n", 514 | "\n", 515 | "model = keras.models.Sequential()\n", 516 | "model.add(Reshape((*loader.shape, 1), input_shape=loader.shape))\n", 517 | "print(model.output_shape)\n", 518 | "\n", 519 | "model.add(Conv2D(3, 13, 10, subsample=(1, 4)))\n", 520 | "model.add(Activation(\"relu\"))\n", 521 | "print(model.output_shape)\n", 522 | "\n", 523 | "model.add(Conv2D(15, 1, 10, subsample=(1, 4)))\n", 524 | "model.add(Activation(\"relu\"))\n", 525 | "print(model.output_shape)\n", 526 | "\n", 527 | "model.add(Conv2D(65, 1, 10, subsample=(1, 4)))\n", 528 | "model.add(Activation(\"relu\"))\n", 529 | "print(model.output_shape)\n", 530 | "\n", 531 | "model.add(Flatten())\n", 532 | "print(model.output_shape)\n", 533 | "model.add(Dense(labels_onehot.shape[1]))\n", 534 | "model.add(Activation(\"softmax\"))\n", 535 | "print(model.output_shape)\n", 536 | "\n", 537 | "optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)\n", 538 | "#optimizer = keras.optimizers.Adam()#lr=1e-5)#\n", 539 | "model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])\n", 540 | "\n", 541 | "model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=20, **params)\n", 542 | "loss = model.evaluate_generator(SampleLoader(val, batch_size=16), val.size, **params)\n", 543 | "loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)\n", 544 | "#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)\n", 545 | "\n", 546 | "loss" 547 | ] 548 | } 549 | ], 550 | "metadata": {}, 551 | "nbformat": 4, 552 | "nbformat_minor": 1 553 | } 554 | -------------------------------------------------------------------------------- /analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n", 8 | "\n", 9 | "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n", 10 | "\n", 11 | "## Analysis\n", 12 | "\n", 13 | "All numbers and figures which appear in the [paper] and much more.\n", 14 | "\n", 15 | "[paper]: https://arxiv.org/abs/1612.01840" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 
24 | "%matplotlib inline\n", 25 | "\n", 26 | "import IPython.display as ipd\n", 27 | "import numpy as np\n", 28 | "import pandas as pd\n", 29 | "import matplotlib.pyplot as plt\n", 30 | "import seaborn as sns\n", 31 | "from sklearn.preprocessing import MultiLabelBinarizer\n", 32 | "\n", 33 | "import utils\n", 34 | "\n", 35 | "sns.set_context(\"notebook\", font_scale=1.5)\n", 36 | "plt.rcParams['figure.figsize'] = (17, 5)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "tracks = utils.load('data/fma_metadata/tracks.csv')\n", 46 | "genres = utils.load('data/fma_metadata/genres.csv')\n", 47 | "features = utils.load('data/fma_metadata/features.csv')\n", 48 | "echonest = utils.load('data/fma_metadata/echonest.csv')\n", 49 | "\n", 50 | "np.testing.assert_array_equal(features.index, tracks.index)\n", 51 | "assert echonest.index.isin(tracks.index).all()\n", 52 | "\n", 53 | "tracks.shape, genres.shape, features.shape, echonest.shape" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## 1 Size\n", 61 | "\n", 62 | "Todo:\n", 63 | "* When are tracks mostly added.\n", 64 | "* Which tracks got deleted." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "print('{} tracks, {} artists, {} albums, {} genres'.format(\n", 74 | " len(tracks), len(tracks['artist', 'id'].unique()),\n", 75 | " len(tracks['album', 'id'].unique()),\n", 76 | " sum(genres['#tracks'] > 0)))\n", 77 | "mean_duration = tracks['track', 'duration'].mean()\n", 78 | "print('track duration: {:.0f} days total, {:.0f} seconds average'.format(\n", 79 | " sum(tracks['track', 'duration']) / 3600 / 24,\n", 80 | " mean_duration))" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "dimensionality = mean_duration * 44000 * 2\n", 90 | "print('sample dimensionality: {:.1e}'.format(dimensionality))\n", 91 | "print('total size, i.e. 
number of audio samples: {:.1e}'.format(dimensionality * len(tracks)))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "for subset in tracks['set', 'subset'].unique():\n", 101 | " indicator = tracks['set', 'subset'] <= subset\n", 102 | " print('{:6} {:6} tracks {:.1f} days'.format(\n", 103 | " subset, sum(indicator), sum(indicator) * 30 / 3600 / 24))" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "print('{} deleted tracks (largest track_id is {})'.format(tracks.index.max() - len(tracks), tracks.index.max()))\n", 113 | "print('First track: {}'.format(tracks['track', 'date_created'].min()))\n", 114 | "\n", 115 | "d = pd.DataFrame(tracks.index, index=tracks['track', 'date_created'].values)\n", 116 | "d['indicator'] = 1\n", 117 | "\n", 118 | "fig, ax1 = plt.subplots()\n", 119 | "ax2 = ax1.twinx()\n", 120 | "\n", 121 | "d['track_id'].plot(ax=ax1)\n", 122 | "d['indicator'].cumsum().plot(ax=ax1)\n", 123 | "ax1.set_ylabel('#tracks')\n", 124 | "ax1.set_ylim(0, 160000)\n", 125 | "\n", 126 | "(d['indicator'] * -100).plot(ax=ax2, style='r') # needed for no apparent reason\n", 127 | "color = sns.color_palette('deep', 3)[2]\n", 128 | "d['indicator'].resample('2M').sum().fillna(0).plot(ax=ax2, style='--', color=color)\n", 129 | "ax2.set_ylabel('#tracks added')\n", 130 | "ax2.set_ylim(500, 4500)\n", 131 | "ax2.set_ylim(0, 4000)\n", 132 | "ax2.grid(False)\n", 133 | "\n", 134 | "lns = ax1.get_lines() + [ax2.get_lines()[1]]\n", 135 | "ax1.legend(lns, ['largest track id', '#tracks still present', '#tracks added per 2 months'], loc='lower right')\n", 136 | "\n", 137 | "plt.savefig('figures/growth.pdf')" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "### 1.1 Splits" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "SPLITS = ['training', 'validation', 'test']\n", 154 | "SUBSETS = ['small', 'medium', 'large']\n", 155 | "print('subset #train #val #test val_ratio test_ratio')\n", 156 | "for subset in SUBSETS:\n", 157 | " counts = [sum((tracks['set', 'split'] == split) & (tracks['set', 'subset'] <= subset)) for split in SPLITS]\n", 158 | " ratios = np.array(counts[0] / counts[1:])\n", 159 | " print('{:8s} {:7d} {:7d} {:7d} {:8.2f} {:9.2f}'.format(subset, *counts, *ratios))" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "for subset in ['small', 'medium']:\n", 169 | " subset = tracks['set', 'subset'] <= subset\n", 170 | "\n", 171 | " d = genres.reset_index().set_index('title')\n", 172 | " d = d.loc[tracks.loc[subset, ('track', 'genre_top')].unique()]\n", 173 | "\n", 174 | " for split in SPLITS:\n", 175 | " b = tracks['set', 'split'] == split\n", 176 | " d['#' + split] = tracks.loc[subset & b, ('track', 'genre_top')].value_counts()\n", 177 | "\n", 178 | " d['val_ratio'] = d['#training'] / d['#validation']\n", 179 | " d['test_ratio'] = d['#training'] / d['#test']\n", 180 | "\n", 181 | " ipd.display(d.sort_values('#training', ascending=False))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "d = pd.DataFrame(index=genres.index, columns=SPLITS)\n", 191 | "for genre in 
genres.index:\n", 192 | " b = tracks['track', 'genres_all'].map(lambda genres: genre in genres)\n", 193 | " d.loc[genre] = tracks.loc[b, ('set', 'split')].value_counts()\n", 194 | "d['val_ratio'] = d['training'] / d['validation']\n", 195 | "d['test_ratio'] = d['training'] / d['test']\n", 196 | "d.sort_values('training', ascending=False, inplace=True)\n", 197 | "ipd.display(d.head(10))\n", 198 | "ipd.display(d.tail(10))" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": {}, 204 | "source": [ 205 | "## 2 Metadata" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": null, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "def isnull(column, df=tracks):\n", 215 | " if column[1] in ['tags', 'genres', 'genres_all']:\n", 216 | " return df[column].apply(lambda x: len(x) == 0)\n", 217 | " elif df.dtypes[column] == np.int:\n", 218 | " return df[column] <= 0\n", 219 | " else:\n", 220 | " return df[column].isnull()\n", 221 | "\n", 222 | "def count(series):\n", 223 | " col0 = series.name[0]\n", 224 | " df = tracks if col0 == 'track' else tracks.drop_duplicates((col0, 'id'))\n", 225 | " n = (~isnull(series.name, df)).sum()\n", 226 | " p = n / len(df) * 100\n", 227 | " return n, p\n", 228 | "\n", 229 | "# Columns / metadata usage across dataset.\n", 230 | "d = pd.DataFrame(index=tracks.columns.drop('set'), columns=['n', 'p'])\n", 231 | "d = d.apply(count, axis=1)\n", 232 | "d['n'] = d['n'].astype(np.int)\n", 233 | "d" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "# Excerpt as example in the paper.\n", 243 | "columns = [\n", 244 | " ('track', 'title'),\n", 245 | " ('track', 'genres_all'),\n", 246 | " ('track', 'genre_top'),\n", 247 | " ('track', 'duration'),\n", 248 | " ('track', 'listens'),\n", 249 | " ('album', 'title'),\n", 250 | " ('album', 'listens'),\n", 251 | " ('album', 'tags'),\n", 252 | " ('artist', 'name'),\n", 253 | " ('artist', 'location'),\n", 254 | "]\n", 255 | "\n", 256 | "non_null = ~isnull(columns[0])\n", 257 | "for column in columns[1:]:\n", 258 | " non_null &= ~isnull(column)\n", 259 | "tids = np.random.RandomState(42).permutation(tracks.index[non_null])[:8]\n", 260 | "\n", 261 | "tracks.loc[tids, columns].head()\n", 262 | "\n", 263 | "#tracks.loc[tids, columns].to_latex('figures/tracks.tex', formatters={\n", 264 | "# ('artist', 'longitude'): '{:,.1f}'.format,\n", 265 | "# ('artist', 'latitude'): '{:,.1f}'.format,\n", 266 | "#})" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "tracks['track', 'license'].value_counts().head(10)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "tracks['track', 'language_code'].value_counts().head(10)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": {}, 290 | "source": [ 291 | "### 2.1 Technical data" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "durations = tracks['track', 'duration']\n", 301 | "plt.figure(figsize=(10, 4)) # Poster: (7, 3)\n", 302 | "p = sns.distplot(durations[durations.values < 800], kde=False, rug=False, color='k', hist_kws=dict(alpha=0.4))\n", 303 | "p.set_xlabel('duration [seconds]')\n", 304 | "p.set_ylabel('#tracks')\n", 305 | 
"p.set_xlim(0, 800) # Poster: 500\n", 306 | "plt.tight_layout()\n", 307 | "plt.savefig('figures/duration_distribution.pdf')\n", 308 | "\n", 309 | "durations.describe()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "# Uncommon bit rates are VBR encodings.\n", 319 | "print('Common bit rates: {}'.format(tracks['track', 'bit_rate'].value_counts().head(5).index.tolist()))\n", 320 | "print('Average bit rate: {:.0f} kbit/s'.format(tracks['track', 'bit_rate'].mean()/1000))\n", 321 | "p = sns.distplot(tracks['track', 'bit_rate'], kde=False, rug=False)\n", 322 | "p.set_xlabel('bit rate')\n", 323 | "p.set_ylabel('#tracks');" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "### 2.2 User data" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "# Tags.\n", 340 | "d1 = tracks['track', 'tags'].apply(len)\n", 341 | "d2 = tracks.drop_duplicates(('album', 'id'))\n", 342 | "d2 = d2['album', 'tags'].apply(len)\n", 343 | "d3 = tracks.drop_duplicates(('artist', 'id'))\n", 344 | "d3 = d3['artist', 'tags'].apply(len) - 1\n", 345 | "\n", 346 | "labels = ['track', 'album', 'artist']\n", 347 | "for l, d in zip(labels, [d1, d2, d3]):\n", 348 | " print('{}: from {} to {} tags'.format(l, max(d.min(), 0), d.max()))\n", 349 | "\n", 350 | "MAX = 13 # Poster: 11\n", 351 | "fig, ax1 = plt.subplots(figsize=(10, 4)) # Poster: (7, 3)\n", 352 | "ax2 = ax1.twinx()\n", 353 | "\n", 354 | "ax1.hist(d1, bins=np.arange(MAX)+0.25, rwidth=0.2, color='C0', label=labels[0])\n", 355 | "ax2.hist(d2, bins=np.arange(MAX)+0.50, rwidth=0.2, color='C1', label=labels[1])\n", 356 | "ax2.hist(d3, bins=np.arange(MAX)+0.75, rwidth=0.2, color='C2', label=labels[2])\n", 357 | "\n", 358 | "ax1.set_xlabel('#tags')\n", 359 | "ax1.set_ylabel('#tracks')\n", 360 | "ax2.set_ylabel('#artists / #albums')\n", 361 | "ax1.set_xlim(0.5, MAX-0.5)\n", 362 | "ax1.set_xticks(range(1, MAX))\n", 363 | "ax1.set_ylim(0, 5000)\n", 364 | "ax2.set_ylim(0, 500)\n", 365 | "ax1.legend(loc='upper center')\n", 366 | "ax2.legend(loc='upper right')\n", 367 | "ax2.grid(False)\n", 368 | "\n", 369 | "fig.tight_layout()\n", 370 | "fig.savefig('figures/tag_distribution.pdf')" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "# One artist tag is often the artist name.\n", 380 | "col = 'artist'\n", 381 | "d = tracks.drop_duplicates((col, 'id'))\n", 382 | "d.loc[d[col, 'tags'].apply(len) > 0, [('artist', 'name'), (col, 'tags')]].head()" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "# Listens, favorites, comments.\n", 392 | "\n", 393 | "def plot(col0, col1, maxval, subplot=None):\n", 394 | " if col0 == 'track':\n", 395 | " d = tracks['track']\n", 396 | " if col0 in ['artist', 'album']:\n", 397 | " d = tracks[col0].drop_duplicates('id')\n", 398 | " if subplot:\n", 399 | " plt.subplot(subplot)\n", 400 | " d = d[col1]\n", 401 | " p = sns.distplot(d[d.values < maxval], kde=False, color='k', hist_kws=dict(alpha=0.4))\n", 402 | " p.set_xlim(-1, maxval)\n", 403 | " p.set_xlabel('#' + col1)\n", 404 | " p.set_ylabel('#' + col0 + 's')\n", 405 | "\n", 406 | "plt.figure(figsize=(17, 10))\n", 407 | "plot('track', 'listens', 10e3, 221)\n", 408 | 
"plot('track', 'interest', 10e3, 222)\n", 409 | "plot('track', 'favorites', 100, 223)\n", 410 | "plot('track', 'comments', 20, 224)\n", 411 | "\n", 412 | "plt.figure(figsize=(17, 10))\n", 413 | "plot('album', 'listens', 100e3, 221)\n", 414 | "plot('album', 'favorites', 100, 223)\n", 415 | "plot('album', 'comments', 20, 224)\n", 416 | "\n", 417 | "plt.figure(figsize=(17, 5))\n", 418 | "plot('artist', 'favorites', 100, 121)\n", 419 | "plot('artist', 'comments', 20, 122)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "# Same as above, formated for the paper.\n", 429 | "plt.figure(figsize=(10, 4)) # Poster: (7, 3)\n", 430 | "plot('album', 'listens', 40e3) # Poster 20e3\n", 431 | "plt.tight_layout()\n", 432 | "plt.savefig('figures/listens_distribution.pdf')\n", 433 | "\n", 434 | "tracks['album', 'listens'].max()" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "# Most listened albums.\n", 444 | "tracks['album'].groupby('id').first().sort_values('listens', ascending=False).head(10)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "### 2.3 Dates" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "def plot(col0, col1):\n", 461 | " if col0 == 'track':\n", 462 | " d = tracks['track']\n", 463 | " if col0 in ['artist', 'album']:\n", 464 | " d = tracks[col0].drop_duplicates('id')\n", 465 | " d = pd.Series(1, index=d[col1])\n", 466 | " d.resample('A').sum().fillna(0).plot()\n", 467 | "\n", 468 | "plt.figure()\n", 469 | "plot('track', 'date_recorded')\n", 470 | "plot('album', 'date_released')\n", 471 | "\n", 472 | "plt.figure()\n", 473 | "plot('artist', 'active_year_begin')\n", 474 | "plot('artist', 'active_year_end')\n", 475 | "\n", 476 | "plt.figure()\n", 477 | "plot('track', 'date_created')\n", 478 | "plot('album', 'date_created')\n", 479 | "plot('artist', 'date_created')" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# Same as above, formated for the paper.\n", 489 | "plt.figure(figsize=(5, 4))\n", 490 | "d = tracks['album'].drop_duplicates('id')\n", 491 | "d = pd.Series(1, index=d['date_released'])\n", 492 | "d = d.resample('A').sum().fillna(0)\n", 493 | "b = d.index >= pd.to_datetime(1990, format='%Y')\n", 494 | "b &= d.index <= pd.to_datetime(2017, format='%Y')\n", 495 | "d[b].plot(color='k')\n", 496 | "plt.xlabel('release year')\n", 497 | "plt.ylabel('#albums')\n", 498 | "plt.tight_layout()\n", 499 | "plt.savefig('figures/album_release_year.pdf')\n", 500 | "\n", 501 | "d.index.min().year, d.index.max().year" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "## 3 Artists & albums effect" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "for effect in ['artist', 'album']:\n", 518 | " d = tracks[effect, 'id'].value_counts()\n", 519 | " ipd.display(d.head(5))\n", 520 | " p = sns.distplot(d[(d.values < 50) & (d.values >= 0)], kde=False)\n", 521 | " p.set_xlabel('#tracks per ' + effect);\n", 522 | " p.set_ylabel('#' + effect + 's');" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | 
"execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "counts = pd.Series(index=genres.loc[genres['parent'] == 0, 'title'].values, name='#artists')\n", 532 | "for genre in counts.index:\n", 533 | " counts[genre] = len(tracks.loc[tracks['track', 'genre_top'] == genre, ('artist', 'id')].unique())\n", 534 | "counts.sort_values(ascending=False).plot.bar()\n", 535 | "plt.ylabel('#artists');" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "## 4 Genres" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "a = set(tracks['track', 'genre_top'].unique().dropna())\n", 552 | "b = set(genres.loc[genres['top_level'].unique(), 'title'].values)\n", 553 | "assert a == b\n", 554 | "\n", 555 | "print('{} top-level genres'.format(len(a)))\n", 556 | "genres[genres['parent'] == 0].sort_values('#tracks', ascending=False)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "markdown", 561 | "metadata": {}, 562 | "source": [ 563 | "Number of genres per track:\n", 564 | "* `genres`: they have introduced a [limit of 3 genres per track](https://twitter.com/therewasaguy/status/863426542075953152) early on.\n", 565 | "* `genres_all`: more genres per track as all coarser genres in the hierarchy are included. E.g. an Indie-Rock song is counted as a Rock song too." 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "# Genres per track.\n", 575 | "labels = ['genres', 'genres_all'] #, 'genres_top']\n", 576 | "d = [tracks['track', label].map(len) for label in labels]\n", 577 | "labels = ['{}\\nmax: {}'.format(label, d1.max()) for label, d1 in zip(labels, d)]\n", 578 | "\n", 579 | "for l, d1 in zip(labels, d):\n", 580 | " print('{} per track: from {} to {} tags'.format(l, d1.min(), d1.max()))\n", 581 | "print('#tracks without genre: {}'.format((tracks['track', 'genres'].map(len) == 0).sum()))\n", 582 | "\n", 583 | "MAX = 9\n", 584 | "fig, ax = plt.subplots(figsize=(5, 4))\n", 585 | "ax.hist(d, bins=np.arange(MAX)-0.5, label=labels)\n", 586 | "ax.set_xlabel('#genres per track')\n", 587 | "ax.set_ylabel('#tracks')\n", 588 | "ax.set_xlim(-0.5, MAX-1.5)\n", 589 | "ax.set_xticks(range(MAX-1))\n", 590 | "ax.set_yticklabels(['0'] + ['{}0k'.format(i) for i in range(1, 6)])\n", 591 | "ax.legend(loc='upper right')\n", 592 | "fig.tight_layout()\n", 593 | "fig.savefig('figures/genres_per_track.pdf')" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "# Number of tracks per genre (full).\n", 603 | "d = genres[genres['#tracks'] > 2000].sort_values('#tracks', ascending=False) # Poster: 5000\n", 604 | "plt.figure(figsize=(10, 4)) # Poster: (7, 4)\n", 605 | "p = sns.barplot('title', '#tracks', data=d, color='k', alpha=0.4)\n", 606 | "p.set_xlabel('')\n", 607 | "p.set_ylabel('#tracks')\n", 608 | "plt.xticks(rotation=90)\n", 609 | "plt.tight_layout()\n", 610 | "plt.savefig('figures/genre_distribution.pdf')\n", 611 | "\n", 612 | "genres.loc[genres['#tracks'] > 0, '#tracks'].min(), genres['#tracks'].max()" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [ 621 | "# Number of tracks per top-level genre (medium).\n", 622 | "d = tracks[tracks['set', 'subset'] <= 'medium']\n", 623 | 
"d = d['track', 'genre_top'].value_counts()\n", 624 | "plt.figure(figsize=(10, 4)) # Poster: (7, 4)\n", 625 | "d.plot.bar(color='k', alpha=0.4)\n", 626 | "plt.ylabel('#tracks')\n", 627 | "plt.xlabel('')\n", 628 | "plt.tight_layout()\n", 629 | "plt.savefig('figures/genre_top_distribution.pdf')\n", 630 | "\n", 631 | "d" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "### 4.1 Genre hierarchy\n", 639 | "\n", 640 | "* As genres have parent genres, we can plot a tree using the [DOT] language.\n", 641 | "* Save the full genre tree as a PDF.\n", 642 | "\n", 643 | "Todo:\n", 644 | "* Color nodes according to FMA genre color.\n", 645 | "* Better looking tree.\n", 646 | "\n", 647 | "[DOT]: https://en.wikipedia.org/wiki/DOT_(graph_description_language)" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "g = utils.Genres(genres)\n", 657 | "graph = g.create_tree([25, 31], 1)\n", 658 | "ipd.Image(graph.create_png())" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": null, 664 | "metadata": {}, 665 | "outputs": [], 666 | "source": [ 667 | "graph = g.create_tree(14)\n", 668 | "graph.write_pdf('figures/genre_hierarchy.pdf');\n", 669 | "\n", 670 | "roots = g.find_roots()\n", 671 | "print('{} roots'.format(len(roots)))\n", 672 | "graph = g.create_tree(roots)\n", 673 | "graph.write_pdf('figures/genre_hierarchy.pdf');" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": {}, 679 | "source": [ 680 | "### 4.2 Cross-appearance\n", 681 | "\n", 682 | "Todo:\n", 683 | "* Group rows and columns for better identification of related genres." 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "enc = MultiLabelBinarizer()\n", 693 | "genres_indicator = enc.fit_transform(tracks['track', 'genres'])\n", 694 | "genres_names = enc.classes_\n", 695 | "genres_names = genres.loc[enc.classes_, 'title'].values\n", 696 | "cross_correlation = genres_indicator.T @ genres_indicator" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": null, 702 | "metadata": {}, 703 | "outputs": [], 704 | "source": [ 705 | "np.fill_diagonal(cross_correlation, 0)\n", 706 | "\n", 707 | "plt.figure(figsize=(28, 28))\n", 708 | "plt.imshow(np.log(cross_correlation))\n", 709 | "plt.yticks(range(len(genres_names)), genres_names);\n", 710 | "plt.xticks(range(len(genres_names)), genres_names, rotation=90);" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": null, 716 | "metadata": {}, 717 | "outputs": [], 718 | "source": [ 719 | "cross_correlation = np.tril(cross_correlation, k=-1)\n", 720 | "sort = np.argsort(cross_correlation.flatten())\n", 721 | "\n", 722 | "N = 20\n", 723 | "indices = np.unravel_index(sort[:-N:-1], cross_correlation.shape)\n", 724 | "for i, j in zip(*indices):\n", 725 | " print('{}: {} | {}'.format(cross_correlation[i, j], genres_names[i], genres_names[j]))" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | "## 5 Audio\n", 733 | "\n", 734 | "Todo: e.g. audio features (echonest / librosa, spectrograms) to show diversity." 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "## 6 Features\n", 742 | "\n", 743 | "Todo: understand features by listening to segments who have them, e.g. ." 
744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": null, 749 | "metadata": {}, 750 | "outputs": [], 751 | "source": [ 752 | "features.head(5).style.format('{:.2f}')" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": null, 758 | "metadata": {}, 759 | "outputs": [], 760 | "source": [ 761 | "sns.pairplot(features.loc[:, ('mfcc', 'mean', slice('01','03'))]);\n", 762 | "sns.pairplot(features.loc[:, ('mfcc', 'std', slice('01','03'))]);" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "## 7 Echonest features" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": null, 775 | "metadata": {}, 776 | "outputs": [], 777 | "source": [ 778 | "print('Echonest features available for {} tracks.'.format(len(echonest)))" 779 | ] 780 | } 781 | ], 782 | "metadata": {}, 783 | "nbformat": 4, 784 | "nbformat_minor": 2 785 | } 786 | -------------------------------------------------------------------------------- /creation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n", 8 | "\n", 9 | "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n", 10 | "\n", 11 | "## Creation\n", 12 | "\n", 13 | "From `raw_*.csv`, this notebook generates:\n", 14 | "* `tracks.csv`: per-track / album / artist metadata.\n", 15 | "* `genres.csv`: genre hierarchy.\n", 16 | "* `echonest.csv`: cleaned Echonest features.\n", 17 | "\n", 18 | "A companion script, [creation.py](creation.py):\n", 19 | "1. Query the [API](https://freemusicarchive.org/api) and store metadata in `raw_tracks.csv`, `raw_albums.csv`, `raw_artists.csv` and `raw_genres.csv`.\n", 20 | "2. Download the audio for each track.\n", 21 | "3. Trim the audio to 30s clips.\n", 22 | "4. Normalize the permissions and modification / access times.\n", 23 | "5. Create the `.zip` archives." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import os\n", 33 | "import ast\n", 34 | "import pickle\n", 35 | "\n", 36 | "import IPython.display as ipd\n", 37 | "import numpy as np\n", 38 | "import pandas as pd\n", 39 | "\n", 40 | "import utils\n", 41 | "import creation" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "AUDIO_DIR = os.environ.get('AUDIO_DIR')\n", 51 | "BASE_DIR = os.path.abspath(os.path.dirname(AUDIO_DIR))\n", 52 | "FMA_FULL = os.path.join(BASE_DIR, 'fma_full')\n", 53 | "FMA_LARGE = os.path.join(BASE_DIR, 'fma_large')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## 1 Retrieve metadata and audio from FMA\n", 61 | "\n", 62 | "1. Crawl the tracks, albums and artists metadata through their [API](https://freemusicarchive.org/api).\n", 63 | "2. Download original `.mp3` by HTTPS for each track id (only if we don't have it already).\n", 64 | "\n", 65 | "Todo:\n", 66 | "* Scrap curators.\n", 67 | "* Download images (`track_image_file`, `album_image_file`, `artist_image_file`). 
Beware the quality.\n", 68 | "* Verify checksum for some random tracks.\n", 69 | "\n", 70 | "Dataset update:\n", 71 | "* To add new tracks: iterate from largest known track id to the most recent only.\n", 72 | "* To update user data: we need to get all tracks again." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "# ./creation.py metadata\n", 82 | "# ./creation.py data /path/to/fma/fma_full\n", 83 | "# ./creation.py clips /path/to/fma\n", 84 | "\n", 85 | "#!cat creation.py" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# converters={'genres': ast.literal_eval}\n", 95 | "tracks = pd.read_csv('raw_tracks.csv', index_col=0)\n", 96 | "albums = pd.read_csv('raw_albums.csv', index_col=0)\n", 97 | "artists = pd.read_csv('raw_artists.csv', index_col=0)\n", 98 | "genres = pd.read_csv('raw_genres.csv', index_col=0)\n", 99 | "\n", 100 | "not_found = pickle.load(open('not_found.pickle', 'rb'))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "def get_fs_tids(audio_dir):\n", 110 | " tids = []\n", 111 | " for _, dirnames, files in os.walk(audio_dir):\n", 112 | " if dirnames == []:\n", 113 | " tids.extend(int(file[:-4]) for file in files)\n", 114 | " return tids\n", 115 | "\n", 116 | "audio_tids = get_fs_tids(FMA_FULL)\n", 117 | "clips_tids = get_fs_tids(FMA_LARGE)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "print('tracks: {} collected ({} not found, {} max id)'.format(\n", 127 | " len(tracks), len(not_found['tracks']), tracks.index.max()))\n", 128 | "print('albums: {} collected ({} not found, {} in tracks)'.format(\n", 129 | " len(albums), len(not_found['albums']), len(tracks['album_id'].unique())))\n", 130 | "print('artists: {} collected ({} not found, {} in tracks)'.format(\n", 131 | " len(artists), len(not_found['artists']), len(tracks['artist_id'].unique())))\n", 132 | "print('genres: {} collected'.format(len(genres)))\n", 133 | "print('audio: {} collected ({} not found, {} not in tracks)'.format(\n", 134 | " len(audio_tids), len(not_found['audio']), len(set(audio_tids).difference(tracks.index))))\n", 135 | "print('clips: {} collected ({} not found, {} not in tracks)'.format(\n", 136 | " len(clips_tids), len(not_found['clips']), len(set(clips_tids).difference(tracks.index))))\n", 137 | "assert sum(tracks.index.isin(audio_tids)) + len(not_found['audio']) == len(tracks)\n", 138 | "assert sum(tracks.index.isin(clips_tids)) + len(not_found['clips']) == sum(tracks.index.isin(audio_tids))\n", 139 | "assert len(clips_tids) + len(not_found['clips']) + len(not_found['audio']) == len(tracks)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "N = 5\n", 149 | "ipd.display(tracks.head(N))\n", 150 | "ipd.display(albums.head(N))\n", 151 | "ipd.display(artists.head(N))\n", 152 | "ipd.display(genres.head(N))" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "## 2 Format metadata\n", 160 | "\n", 161 | "Todo:\n", 162 | "* Sanitize values, e.g. list of words for tags, valid links in `artist_wikipedia_page`, remove html markup in free-form text.\n", 163 | " * Clean tags. E.g. 
some tags are just artist names.\n", 164 | "* Fill metadata about encoding: length, number of samples, sample rate, bit rate, channels (mono/stereo), 16bits?.\n", 165 | "* Update duration from audio\n", 166 | " * 2624 is marked as 05:05:50 (18350s) although it is reported as 00:21:15.15 by ffmpeg.\n", 167 | " * 112067: 3714s --> 01:59:55.06, 112808: 3718s --> 01:59:59.56\n", 168 | " * ffmpeg: Estimating duration from bitrate, this may be inaccurate\n", 169 | " * Solution, decode the complete mp3: `ffmpeg -i input.mp3 -f null -`" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "df, column = tracks, 'tags'\n", 179 | "null = sum(df[column].isnull())\n", 180 | "print('{} null, {} non-null'.format(null, df.shape[0] - null))\n", 181 | "df[column].value_counts().head(10)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### 2.1 Tracks" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "drop = [\n", 198 | " 'license_image_file', 'license_image_file_large', 'license_parent_id', 'license_url', # keep title only\n", 199 | " 'track_file', 'track_image_file', # used to download only\n", 200 | " 'track_url', 'album_url', 'artist_url', # only relevant on website\n", 201 | " 'track_copyright_c', 'track_copyright_p', # present for ~1000 tracks only\n", 202 | " # 'track_composer', 'track_lyricist', 'track_publisher', # present for ~4000, <1000 and <2000 tracks\n", 203 | " 'track_disc_number', # different from 1 for <1000 tracks\n", 204 | " 'track_explicit', 'track_explicit_notes', # present for <4000 tracks\n", 205 | " 'track_instrumental' # ~6000 tracks have a 1, there is an instrumental genre\n", 206 | "]\n", 207 | "tracks.drop(drop, axis=1, inplace=True)\n", 208 | "tracks.rename(columns={'license_title': 'track_license', 'tags': 'track_tags'}, inplace=True)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "tracks['track_duration'] = tracks['track_duration'].map(creation.convert_duration)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "def convert_datetime(df, column, format=None):\n", 227 | " df[column] = pd.to_datetime(df[column], infer_datetime_format=True, format=format)\n", 228 | "convert_datetime(tracks, 'track_date_created')\n", 229 | "convert_datetime(tracks, 'track_date_recorded')" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "tracks['album_id'].fillna(-1, inplace=True)\n", 239 | "tracks['track_bit_rate'].fillna(-1, inplace=True)\n", 240 | "tracks = tracks.astype({'album_id': int, 'track_bit_rate': int})" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "def convert_genres(genres):\n", 250 | " genres = ast.literal_eval(genres)\n", 251 | " return [int(genre['genre_id']) for genre in genres]\n", 252 | "\n", 253 | "tracks['track_genres'].fillna('[]', inplace=True)\n", 254 | "tracks['track_genres'] = tracks['track_genres'].map(convert_genres)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": 
{}, 261 | "outputs": [], 262 | "source": [ 263 | "tracks.columns" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "### 2.2 Albums" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "drop = [\n", 280 | " 'artist_name', 'album_url', 'artist_url', # in tracks already (though it can be different)\n", 281 | " 'album_handle',\n", 282 | " 'album_image_file', 'album_images', # todo: shall be downloaded\n", 283 | " #'album_producer', 'album_engineer', # present for ~2400 albums only\n", 284 | "]\n", 285 | "albums.drop(drop, axis=1, inplace=True)\n", 286 | "albums.rename(columns={'tags': 'album_tags'}, inplace=True)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "convert_datetime(albums, 'album_date_created')\n", 296 | "convert_datetime(albums, 'album_date_released')" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "albums.columns" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### 2.3 Artists" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "drop = [\n", 322 | " 'artist_website', 'artist_url', # in tracks already (though it can be different)\n", 323 | " 'artist_handle',\n", 324 | " 'artist_image_file', 'artist_images', # todo: shall be downloaded\n", 325 | " 'artist_donation_url', 'artist_paypal_name', 'artist_flattr_name', # ~1600 & ~400 & ~70, not relevant\n", 326 | " 'artist_contact', # ~1500, not very useful data\n", 327 | " # 'artist_active_year_begin', 'artist_active_year_end', # ~1400, ~500 only\n", 328 | " # 'artist_associated_labels', # ~1000\n", 329 | " # 'artist_related_projects', # only ~800, but can be combined with bio\n", 330 | "]\n", 331 | "artists.drop(drop, axis=1, inplace=True)\n", 332 | "artists.rename(columns={'tags': 'artist_tags'}, inplace=True)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "convert_datetime(artists, 'artist_date_created')\n", 342 | "for column in ['artist_active_year_begin', 'artist_active_year_end']:\n", 343 | " artists[column].replace(0.0, np.nan, inplace=True)\n", 344 | " convert_datetime(artists, column, format='%Y.0')" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "artists.columns" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "### 2.4 Merge DataFrames" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "not_found['albums'].remove(None)\n", 370 | "not_found['albums'].append(-1)\n", 371 | "not_found['albums'] = [int(i) for i in not_found['albums']]\n", 372 | "not_found['artists'] = [int(i) for i in not_found['artists']]" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "tracks = tracks.merge(albums, left_on='album_id', right_index=True, sort=False, how='left', suffixes=('', '_dup'))\n", 382 
| "\n", 383 | "n = sum(tracks['album_title_dup'].isnull())\n", 384 | "print('{} tracks without extended album information ({} tracks without album_id)'.format(\n", 385 | " n, sum(tracks['album_id'] == -1)))\n", 386 | "assert sum(tracks['album_id'].isin(not_found['albums'])) == n\n", 387 | "assert sum(tracks['album_title'] != tracks['album_title_dup']) == n\n", 388 | "\n", 389 | "tracks.drop('album_title_dup', axis=1, inplace=True)\n", 390 | "assert not any('dup' in col for col in tracks.columns)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "# Album artist can be different than track artist. Keep track artist.\n", 400 | "#tracks[tracks['artist_name'] != tracks['artist_name_dup']].select(lambda x: 'artist_name' in x, axis=1)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "tracks = tracks.merge(artists, left_on='artist_id', right_index=True, sort=False, how='left', suffixes=('', '_dup'))\n", 410 | "\n", 411 | "n = sum(tracks['artist_name_dup'].isnull())\n", 412 | "print('{} tracks without extended artist information'.format(n))\n", 413 | "assert sum(tracks['artist_id'].isin(not_found['artists'])) == n\n", 414 | "assert sum(tracks['artist_name'] != tracks[('artist_name_dup')]) == n\n", 415 | "\n", 416 | "tracks.drop('artist_name_dup', axis=1, inplace=True)\n", 417 | "assert not any('dup' in col for col in tracks.columns)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "columns = []\n", 427 | "for name in tracks.columns:\n", 428 | " names = name.split('_')\n", 429 | " columns.append((names[0], '_'.join(names[1:])))\n", 430 | "tracks.columns = pd.MultiIndex.from_tuples(columns)\n", 431 | "assert all(label in ['track', 'album', 'artist'] for label in tracks.columns.get_level_values(0))" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "# Todo: fill other columns ?\n", 441 | "tracks['album', 'tags'].fillna('[]', inplace=True)\n", 442 | "tracks['artist', 'tags'].fillna('[]', inplace=True)\n", 443 | "\n", 444 | "columns = [('album', 'favorites'), ('album', 'comments'), ('album', 'listens'), ('album', 'tracks'),\n", 445 | " ('artist', 'favorites'), ('artist', 'comments')]\n", 446 | "for column in columns:\n", 447 | " tracks[column].fillna(-1, inplace=True)\n", 448 | "columns = {column: int for column in columns}\n", 449 | "tracks = tracks.astype(columns)" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "## 3 Data cleaning\n", 457 | "\n", 458 | "Todo: duplicates (metadata and audio)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "def keep(index, df):\n", 468 | " old = len(df)\n", 469 | " df = df.loc[index]\n", 470 | " new = len(df)\n", 471 | " print('{} lost, {} left'.format(old - new, new))\n", 472 | " return df\n", 473 | "\n", 474 | "tracks = keep(tracks.index, tracks)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "# Audio not found or could not be trimmed.\n", 484 | "tracks = keep(tracks.index.difference(not_found['audio']), 
tracks)\n", 485 | "tracks = keep(tracks.index.difference(not_found['clips']), tracks)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "Errors from the `features.py` script.\n", 493 | "* IndexError('index 0 is out of bounds for axis 0 with size 0',)\n", 494 | " * ffmpeg: Header missing\n", 495 | " * ffmpeg: Could not find codec parameters for stream 0 (Audio: mp3, 0 channels, s16p): unspecified frame size. Consider increasing the value for the 'analyzeduration' and 'probesize' options\n", 496 | " * tids: 117759\n", 497 | "* NoBackendError()\n", 498 | " * ffmpeg: Format mp3 detected only with low score of 1, misdetection possible!\n", 499 | " * tids: 80015, 115235\n", 500 | "* UserWarning('Trying to estimate tuning from empty frequency set.',)\n", 501 | " * librosa error\n", 502 | " * tids: 1440, 26436, 38903, 57603, 62095, 62954, 62956, 62957, 62959, 62971, 86079, 96426, 104623, 106719, 109714, 114501, 114528, 118003, 118004, 127827, 130298, 130296, 131076, 135804, 154923\n", 503 | "* ParameterError('Filter pass-band lies beyond Nyquist',)\n", 504 | " * librosa error\n", 505 | " * tids: 152204, 28106, 29166, 29167, 29169, 29168, 29170, 29171, 29172, 29173, 29179, 43903, 56757, 59361, 75461, 92346, 92345, 92347, 92349, 92350, 92351, 92353, 92348, 92352, 92354, 92355, 92356, 92358, 92359, 92361, 92360, 114448, 136486, 144769, 144770, 144771, 144773, 144774, 144775, 144778, 144776, 144777" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "# Feature extraction failed.\n", 515 | "FAILED = [1440, 26436, 28106, 29166, 29167, 29168, 29169, 29170, 29171, 29172,\n", 516 | " 29173, 29179, 38903, 43903, 56757, 57603, 59361, 62095, 62954, 62956,\n", 517 | " 62957, 62959, 62971, 75461, 80015, 86079, 92345, 92346, 92347, 92348,\n", 518 | " 92349, 92350, 92351, 92352, 92353, 92354, 92355, 92356, 92357, 92358,\n", 519 | " 92359, 92360, 92361, 96426, 104623, 106719, 109714, 114448, 114501,114528,\n", 520 | " 115235, 117759, 118003, 118004, 127827, 130296, 130298, 131076, 135804, 136486,\n", 521 | " 144769, 144770, 144771, 144773, 144774, 144775, 144776, 144777, 144778, 152204,\n", 522 | " 154923]\n", 523 | "tracks = keep(tracks.index.difference(FAILED), tracks)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "# License forbids redistribution.\n", 533 | "tracks = keep(tracks['track', 'license'] != 'FMA-Limited: Download Only', tracks)\n", 534 | "print('{} licenses'.format(len(tracks[('track', 'license')].unique())))" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "#sum(tracks['track', 'title'].duplicated())" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "## 4 Genres" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "genres.drop(['genre_handle', 'genre_color'], axis=1, inplace=True)\n", 560 | "genres.rename(columns={'genre_parent_id': 'parent', 'genre_title': 'title'}, inplace=True)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "genres['parent'].fillna(0, inplace=True)\n", 570 | "genres = 
genres.astype({'parent': int})" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "# 13 (Easy Listening) has parent 126 which is missing\n", 580 | "# --> a root genre on the website, although not in the genre menu\n", 581 | "genres.at[13, 'parent'] = 0\n", 582 | "\n", 583 | "# 580 (Abstract Hip-Hop) has parent 1172 which is missing\n", 584 | "# --> listed as child of Hip-Hop on the website\n", 585 | "genres.at[580, 'parent'] = 21\n", 586 | "\n", 587 | "# 810 (Nu-Jazz) has parent 51 which is missing\n", 588 | "# --> listed as child of Easy Listening on website\n", 589 | "genres.at[810, 'parent'] = 13\n", 590 | "\n", 591 | "# 763 (Holiday) has parent 763 which is itself\n", 592 | "# --> listed as child of Sound Effects on website\n", 593 | "genres.at[763, 'parent'] = 16\n", 594 | "\n", 595 | "# Todo: should novelty be under Experimental? It is alone on website." 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "# Genre 806 (hiphop) should not exist. Replace it by 21 (Hip-Hop).\n", 605 | "print('{} tracks have genre 806'.format(\n", 606 | " sum(tracks['track', 'genres'].map(lambda genres: 806 in genres))))\n", 607 | "def change_genre(genres):\n", 608 | " return [genre if genre != 806 else 21 for genre in genres]\n", 609 | "tracks['track', 'genres'] = tracks['track', 'genres'].map(change_genre)\n", 610 | "genres.drop(806, inplace=True)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "def get_parent(genre, track_all_genres=None):\n", 620 | " parent = genres.at[genre, 'parent']\n", 621 | " if track_all_genres is not None:\n", 622 | " track_all_genres.append(genre)\n", 623 | " return genre if parent == 0 else get_parent(parent, track_all_genres)\n", 624 | "\n", 625 | "# Get all genres, i.e. 
all genres encountered when walking from leafs to roots.\n", 626 | "def get_all_genres(track_genres):\n", 627 | " track_all_genres = list()\n", 628 | " for genre in track_genres:\n", 629 | " get_parent(genre, track_all_genres)\n", 630 | " return list(set(track_all_genres))\n", 631 | "\n", 632 | "tracks['track', 'genres_all'] = tracks['track', 'genres'].map(get_all_genres)" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "# Number of tracks per genre.\n", 642 | "def count_genres(subset=tracks.index):\n", 643 | " count = pd.Series(0, index=genres.index)\n", 644 | " for _, track_all_genres in tracks.loc[subset, ('track', 'genres_all')].items():\n", 645 | " for genre in track_all_genres:\n", 646 | " count[genre] += 1\n", 647 | " return count\n", 648 | "\n", 649 | "genres['#tracks'] = count_genres()\n", 650 | "genres[genres['#tracks'] == 0]" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [ 659 | "def get_top_genre(track_genres):\n", 660 | " top_genres = set(genres.at[genres.at[genre, 'top_level'], 'title'] for genre in track_genres)\n", 661 | " return top_genres.pop() if len(top_genres) == 1 else np.nan\n", 662 | "\n", 663 | "# Top-level genre.\n", 664 | "genres['top_level'] = genres.index.map(get_parent)\n", 665 | "tracks['track', 'genre_top'] = tracks['track', 'genres'].map(get_top_genre)" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": null, 671 | "metadata": {}, 672 | "outputs": [], 673 | "source": [ 674 | "genres.head(10)" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "## 5 Subsets: large, medium, small" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "### 5.1 Large\n", 689 | "\n", 690 | "Main characteristic: the full set with clips trimmed to a manageable size." 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "### 5.2 Medium\n", 698 | "\n", 699 | "Main characteristic: clean metadata (includes 1 top-level genre) and quality audio." 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "metadata": {}, 706 | "outputs": [], 707 | "source": [ 708 | "fma_medium = pd.DataFrame(tracks)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "# Missing meta-information.\n", 718 | "\n", 719 | "# Missing extended album and artist information.\n", 720 | "fma_medium = keep(~fma_medium['album', 'id'].isin(not_found['albums']), fma_medium)\n", 721 | "fma_medium = keep(~fma_medium['artist', 'id'].isin(not_found['artists']), fma_medium)\n", 722 | "\n", 723 | "# Untitled track or album.\n", 724 | "fma_medium = keep(~fma_medium['track', 'title'].isnull(), fma_medium)\n", 725 | "fma_medium = keep(fma_medium['track', 'title'].map(lambda x: 'untitled' in x.lower()) == False, fma_medium)\n", 726 | "fma_medium = keep(fma_medium['album', 'title'].map(lambda x: 'untitled' in x.lower()) == False, fma_medium)\n", 727 | "\n", 728 | "# One tag is often just the artist name. 
Tags too scarce for tracks and albums.\n", 729 | "#keep(fma_medium['artist', 'tags'].map(len) >= 2, fma_medium)\n", 730 | "\n", 731 | "# Too scarce.\n", 732 | "#fma_medium = keep(~fma_medium['album', 'information'].isnull(), fma_medium)\n", 733 | "#fma_medium = keep(~fma_medium['artist', 'bio'].isnull(), fma_medium)\n", 734 | "#fma_medium = keep(~fma_medium['artist', 'website'].isnull(), fma_medium)\n", 735 | "#fma_medium = keep(~fma_medium['artist', 'wikipedia_page'].isnull(), fma_medium)\n", 736 | "\n", 737 | "# Too scarce.\n", 738 | "#fma_medium = keep(~fma_medium['artist', 'location'].isnull(), fma_medium)\n", 739 | "#fma_medium = keep(~fma_medium['artist', 'latitude'].isnull(), fma_medium)\n", 740 | "#fma_medium = keep(~fma_medium['artist', 'longitude'].isnull(), fma_medium)" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": null, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "# Technical quality.\n", 750 | "# Todo: sample rate\n", 751 | "fma_medium = keep(fma_medium['track', 'bit_rate'] > 100000, fma_medium)\n", 752 | "\n", 753 | "# Choosing standard bit rates discards all VBR.\n", 754 | "#fma_medium = keep(fma_medium['track', 'bit_rate'].isin([320000, 256000, 192000, 160000, 128000]), fma_medium)" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [ 763 | "fma_medium = keep(fma_medium['track', 'duration'] >= 60, fma_medium)\n", 764 | "fma_medium = keep(fma_medium['track', 'duration'] <= 600, fma_medium)\n", 765 | "\n", 766 | "fma_medium = keep(fma_medium['album', 'tracks'] >= 1, fma_medium)\n", 767 | "fma_medium = keep(fma_medium['album', 'tracks'] <= 50, fma_medium)" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": {}, 774 | "outputs": [], 775 | "source": [ 776 | "# Lower popularity bound.\n", 777 | "fma_medium = keep(fma_medium['track', 'listens'] >= 100, fma_medium)\n", 778 | "fma_medium = keep(fma_medium['track', 'interest'] >= 200, fma_medium)\n", 779 | "fma_medium = keep(fma_medium['album', 'listens'] >= 1000, fma_medium);\n", 780 | "\n", 781 | "# Favorites and comments are very scarce.\n", 782 | "#fma_medium = keep(fma_medium['artist', 'favorites'] >= 1, fma_medium)" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": null, 788 | "metadata": {}, 789 | "outputs": [], 790 | "source": [ 791 | "# Targeted genre classification.\n", 792 | "fma_medium = keep(~fma_medium['track', 'genre_top'].isnull(), fma_medium);\n", 793 | "#keep(fma_medium['track', 'genres'].map(len) == 1, fma_medium);" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "metadata": {}, 800 | "outputs": [], 801 | "source": [ 802 | "# Adjust size with popularity measure. 
Should be of better quality.\n", 803 | "N_TRACKS = 25000\n", 804 | "\n", 805 | "# Observations\n", 806 | "# * More albums killed than artists --> be sure not to kill diversity\n", 807 | "# * Favorites and preterites genres differently --> do it per genre?\n", 808 | "# Normalization\n", 809 | "# * mean, median, std, max\n", 810 | "# * tracks per album or artist\n", 811 | "# Test\n", 812 | "# * 4/5 of same tracks were selected with various set of measures\n", 813 | "# * <5% diff with max and mean\n", 814 | "\n", 815 | "popularity_measures = [('track', 'listens'), ('track', 'interest')] # ('album', 'listens')\n", 816 | "# ('track', 'favorites'), ('track', 'comments'),\n", 817 | "# ('album', 'favorites'), ('album', 'comments'),\n", 818 | "# ('artist', 'favorites'), ('artist', 'comments'),\n", 819 | "\n", 820 | "normalization = {measure: fma_medium[measure].max() for measure in popularity_measures}\n", 821 | "def popularity_measure(track):\n", 822 | " return sum(track[measure] / normalization[measure] for measure in popularity_measures)\n", 823 | "fma_medium['popularity_measure'] = fma_medium.apply(popularity_measure, axis=1)\n", 824 | "fma_medium = keep(fma_medium.sort_values('popularity_measure', ascending=False).index[:N_TRACKS], fma_medium)" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": null, 830 | "metadata": {}, 831 | "outputs": [], 832 | "source": [ 833 | "tmp = genres[genres['parent'] == 0].reset_index().set_index('title')\n", 834 | "tmp['#tracks_medium'] = fma_medium['track', 'genre_top'].value_counts()\n", 835 | "tmp.sort_values('#tracks_medium', ascending=False)" 836 | ] 837 | }, 838 | { 839 | "cell_type": "markdown", 840 | "metadata": {}, 841 | "source": [ 842 | "### 5.3 Small\n", 843 | "\n", 844 | "Main characteristic: genre balanced (and echonest features).\n", 845 | "\n", 846 | "Choices:\n", 847 | "* 8 genres with 1000 tracks --> 8,000 tracks\n", 848 | "* 10 genres with 500 tracks --> 5,000 tracks\n", 849 | "\n", 850 | "Todo:\n", 851 | "* Download more echonest features so that all tracks can have them. Otherwise intersection of tracks with echonest features and one top-level genre is too small." 
852 | ] 853 | }, 854 | { 855 | "cell_type": "code", 856 | "execution_count": null, 857 | "metadata": {}, 858 | "outputs": [], 859 | "source": [ 860 | "N_GENRES = 8\n", 861 | "N_TRACKS = 1000\n", 862 | "\n", 863 | "top_genres = tmp.sort_values('#tracks_medium', ascending=False)[:N_GENRES].index\n", 864 | "fma_small = pd.DataFrame(fma_medium)\n", 865 | "fma_small = keep(fma_small['track', 'genre_top'].isin(top_genres), fma_small)" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": null, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [ 874 | "to_keep = []\n", 875 | "for genre in top_genres:\n", 876 | " subset = fma_small[fma_small['track', 'genre_top'] == genre]\n", 877 | " drop = subset.sort_values('popularity_measure').index[:-N_TRACKS]\n", 878 | " fma_small.drop(drop, inplace=True)\n", 879 | "assert len(fma_small) == N_GENRES * N_TRACKS" 880 | ] 881 | }, 882 | { 883 | "cell_type": "markdown", 884 | "metadata": {}, 885 | "source": [ 886 | "### 5.4 Subset indication" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": null, 892 | "metadata": {}, 893 | "outputs": [], 894 | "source": [ 895 | "SUBSETS = ('small', 'medium', 'large')\n", 896 | "tracks['set', 'subset'] = pd.Series().astype('category', categories=SUBSETS, ordered=True)\n", 897 | "tracks.loc[tracks.index, ('set', 'subset')] = 'large'\n", 898 | "tracks.loc[fma_medium.index, ('set', 'subset')] = 'medium'\n", 899 | "tracks.loc[fma_small.index, ('set', 'subset')] = 'small'" 900 | ] 901 | }, 902 | { 903 | "cell_type": "markdown", 904 | "metadata": {}, 905 | "source": [ 906 | "### 5.5 Echonest" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": null, 912 | "metadata": {}, 913 | "outputs": [], 914 | "source": [ 915 | "echonest = pd.read_csv('raw_echonest.csv', index_col=0, header=[0, 1, 2])\n", 916 | "echonest = keep(~echonest['echonest', 'temporal_features'].isnull().any(axis=1), echonest)\n", 917 | "echonest = keep(~echonest['echonest', 'audio_features'].isnull().any(axis=1), echonest)\n", 918 | "echonest = keep(~echonest['echonest', 'social_features'].isnull().any(axis=1), echonest)\n", 919 | "\n", 920 | "echonest = keep(echonest.index.isin(tracks.index), echonest);\n", 921 | "keep(echonest.index.isin(fma_medium.index), echonest);\n", 922 | "keep(echonest.index.isin(fma_small.index), echonest);" 923 | ] 924 | }, 925 | { 926 | "cell_type": "markdown", 927 | "metadata": {}, 928 | "source": [ 929 | "## 6 Splits: training, validation, test\n", 930 | "\n", 931 | "Take into account:\n", 932 | "* Artists may only appear on one side.\n", 933 | "* Stratification: ideally, all characteristics (#tracks per artist, duration, sampling rate, information, bio) and targets (genres, tags) should be equally distributed." 
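The split computed below is consumed by simply filtering on the ('set', 'split') column. A short usage sketch, assuming the tracks.csv written in section 7, which also checks the artist constraint stated above:

```python
import utils

tracks = utils.load('tracks.csv')
medium = tracks['set', 'subset'] <= 'medium'  # ordered categorical: small <= medium <= large

train = tracks.loc[medium & (tracks['set', 'split'] == 'training')]
val = tracks.loc[medium & (tracks['set', 'split'] == 'validation')]
test = tracks.loc[medium & (tracks['set', 'split'] == 'test')]
print(len(train), len(val), len(test))

# Artists may only appear on one side of the split.
assert set(train['artist', 'id']).isdisjoint(test['artist', 'id'])
assert set(train['artist', 'id']).isdisjoint(val['artist', 'id'])
```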
934 | ] 935 | }, 936 | { 937 | "cell_type": "code", 938 | "execution_count": null, 939 | "metadata": {}, 940 | "outputs": [], 941 | "source": [ 942 | "for genre in genres.index:\n", 943 | " tracks['genre', genres.at[genre, 'title']] = tracks['track', 'genres_all'].map(lambda genres: genre in genres)\n", 944 | "\n", 945 | "SPLITS = ('training', 'test', 'validation')\n", 946 | "PERCENTAGES = (0.8, 0.1, 0.1)\n", 947 | "tracks['set', 'split'] = pd.Series().astype('category', categories=SPLITS)\n", 948 | "\n", 949 | "for subset in SUBSETS:\n", 950 | "\n", 951 | " tracks_subset = tracks['set', 'subset'] <= subset\n", 952 | "\n", 953 | " # Consider only top-level genres for small and medium.\n", 954 | " genre_list = list(tracks.loc[tracks_subset, ('track', 'genre_top')].unique())\n", 955 | " if subset == 'large':\n", 956 | " genre_list = list(genres['title']) \n", 957 | "\n", 958 | " while True:\n", 959 | " if len(genre_list) == 0:\n", 960 | " break\n", 961 | "\n", 962 | " # Choose most constrained genre, i.e. genre with the least unassigned artists.\n", 963 | " tracks_unsplit = tracks['set', 'split'].isnull()\n", 964 | " count = tracks[tracks_subset & tracks_unsplit].set_index(('artist', 'id'), append=True)['genre']\n", 965 | " count = count.groupby(level=1).sum().astype(np.bool).sum()\n", 966 | " genre = np.argmin(count[genre_list])\n", 967 | " genre_list.remove(genre)\n", 968 | " \n", 969 | " # Given genre, select artists.\n", 970 | " tracks_genre = tracks['genre', genre] == 1\n", 971 | " artists = tracks.loc[tracks_genre & tracks_subset & tracks_unsplit, ('artist', 'id')].value_counts()\n", 972 | " #print('-->', genre, len(artists))\n", 973 | "\n", 974 | " current = {split: np.sum(tracks_genre & tracks_subset & (tracks['set', 'split'] == split)) for split in SPLITS}\n", 975 | "\n", 976 | " # Assign artists with most tracks first.\n", 977 | " for artist, count in artists.items():\n", 978 | " choice = np.argmin([current[split] / percentage for split, percentage in zip(SPLITS, PERCENTAGES)])\n", 979 | " current[SPLITS[choice]] += count\n", 980 | " #assert tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')].isnull().all()\n", 981 | " tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')] = SPLITS[choice]\n", 982 | "\n", 983 | "# Tracks without genre can only serve as unlabeled data for training, e.g. 
for semi-supervised algorithms.\n", 984 | "no_genres = tracks['track', 'genres_all'].map(lambda genres: len(genres) == 0)\n", 985 | "no_split = tracks['set', 'split'].isnull()\n", 986 | "assert not (no_split & ~no_genres).any()\n", 987 | "tracks.loc[no_split, ('set', 'split')] = 'training'\n", 988 | "\n", 989 | "# Not needed any more.\n", 990 | "tracks.drop('genre', axis=1, level=0, inplace=True)" 991 | ] 992 | }, 993 | { 994 | "cell_type": "markdown", 995 | "metadata": {}, 996 | "source": [ 997 | "## 7 Store" 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "code", 1002 | "execution_count": null, 1003 | "metadata": {}, 1004 | "outputs": [], 1005 | "source": [ 1006 | "for dataset in 'tracks', 'genres', 'echonest':\n", 1007 | " eval(dataset).sort_index(axis=0, inplace=True)\n", 1008 | " eval(dataset).sort_index(axis=1, inplace=True)\n", 1009 | " params = dict(float_format='%.10f') if dataset == 'echonest' else dict()\n", 1010 | " eval(dataset).to_csv(dataset + '.csv', **params)" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": null, 1016 | "metadata": {}, 1017 | "outputs": [], 1018 | "source": [ 1019 | "# ./creation.py normalize /path/to/fma\n", 1020 | "# ./creation.py zips /path/to/fma" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "## 8 Description" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": null, 1033 | "metadata": {}, 1034 | "outputs": [], 1035 | "source": [ 1036 | "tracks = utils.load('tracks.csv')\n", 1037 | "tracks.dtypes" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": null, 1043 | "metadata": {}, 1044 | "outputs": [], 1045 | "source": [ 1046 | "N = 5\n", 1047 | "ipd.display(tracks['track'].head(N))\n", 1048 | "ipd.display(tracks['album'].head(N))\n", 1049 | "ipd.display(tracks['artist'].head(N))" 1050 | ] 1051 | } 1052 | ], 1053 | "metadata": {}, 1054 | "nbformat": 4, 1055 | "nbformat_minor": 2 1056 | } 1057 | --------------------------------------------------------------------------------
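As a closing check, the Description section's reload can be extended to verify that typed columns survive the CSV round trip. A sketch assuming utils.load restores categories, dates and genre lists as in the repository's utils.py:

```python
import utils

tracks = utils.load('tracks.csv')
genres = utils.load('genres.csv')

# Categorical, datetime and list-valued columns should come back typed,
# not as plain strings.
assert tracks['set', 'subset'].dtype.name == 'category'
assert tracks['track', 'date_created'].dtype.kind == 'M'  # datetime64
assert isinstance(tracks['track', 'genres'].iloc[0], list)

# Every genre id referenced by a track should exist in genres.csv.
referenced = {g for gs in tracks['track', 'genres_all'] for g in gs}
assert referenced <= set(genres.index)
```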