├── .binder
    ├── runtime.txt
    ├── apt.txt
    ├── start
    └── postBuild
├── data
    └── .gitignore
├── figures
    └── .gitignore
├── .gitignore
├── setup.py
├── LICENSE.txt
├── makefile
├── requirements.txt
├── webapi.ipynb
├── features.py
├── creation.py
├── usage.ipynb
├── utils.py
├── README.md
├── baselines.ipynb
├── analysis.ipynb
└── creation.ipynb


/.binder/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.6
2 | 


--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/.binder/apt.txt:
--------------------------------------------------------------------------------
1 | graphviz
2 | ffmpeg
3 | 


--------------------------------------------------------------------------------
/figures/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/.binder/start:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | export AUDIO_DIR=./data/fma_small/
3 | exec "$@"
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # IPython checkpoints
 6 | .ipynb_checkpoints/
 7 | 
 8 | # Environment
 9 | .env
10 | .python-version
11 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(name='freemusicarchive',
 4 |       version='0.0.0',
 5 |       description='Free Music Archive',
 6 |       url='https://github.com/mdeff/fma',
 7 |       author='Michaël Defferrard',
 8 |       author_email='michael.defferrard@epfl.ch',
 9 |       license='MIT')
10 | 


--------------------------------------------------------------------------------
/.binder/postBuild:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | python3.6 -m venv ./env
 4 | 
 5 | ./env/bin/pip install --upgrade pip setuptools wheel
 6 | ./env/bin/pip install numpy==1.12.1  # workaround resampy's bogus setup.py
 7 | ./env/bin/pip install -r requirements.txt
 8 | 
 9 | # Shadow the default kernelspec for jupyter to use our environment by default.
10 | ./env/bin/python -m ipykernel install --user
11 | 
12 | cd data
13 | curl -O https://os.unil.cloud.switch.ch/fma/fma_metadata.zip
14 | unzip fma_metadata.zip
15 | rm fma_metadata.zip
16 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Michaël Defferrard
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | NB = $(sort $(wildcard *.ipynb))
 2 | 
 3 | run: $(NB)
 4 | 
 5 | $(NB):
 6 | 	jupyter nbconvert --inplace --execute --ExecutePreprocessor.timeout=-1 $@
 7 | 
 8 | clean:
 9 | 	rm -rf __pycache__/ .ipynb_checkpoints/
10 | 	#jupyter nbconvert --inplace --ClearOutputPreprocessor.enabled=True $(NB)
11 | 	@for nb in $(NB); do \
12 | 		echo "$$(jq --indent 1 ' \
13 | 			.metadata = {} \
14 | 			| (.cells[] | select(has("outputs")) | .outputs) = [] \
15 | 			| (.cells[] | select(has("execution_count")) | .execution_count) = null \
16 | 			| .cells[].metadata = {} \
17 | 			' $$nb)" > $$nb; \
18 | 	done
19 | 
20 | # May be useful to keep for nbsphinx.
21 | # | .metadata = {"language_info": {"name": "python", "pygments_lexer": "ipython3"}} \
22 | 
23 | install:
24 | 	pip install --upgrade pip setuptools wheel
25 | 	pip install numpy==1.12.1  # bug: resampy imports numpy in setup.py
26 | 	# pip install setuptools==38.2.4  # MarkupSafe 1.0 setup.py needs `from setuptools import Feature`
27 | 	pip install -r requirements.txt
28 | 
29 | readme:
30 | 	grip README.md
31 | 
32 | html:
33 | 	grip --export README.md
34 | 	jupyter nbconvert $(NB) --to html
35 | 
36 | .PHONY: run $(NB) clean install readme html
37 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Version numbers have been retrieved from a range of machines and environments.
 2 | # Take them with a grain of salt.
 3 | 
 4 | # Direct dependencies
 5 | #python==3.6.0
 6 | #pip==9.0.1
 7 | #setuptools==38.2.4  # old for MarkupSafe 1.0 (28.8.0 is installed with py 3.6)
 8 | numpy==1.12.1  # 1.12.0
 9 | pandas==0.19.2
10 | matplotlib==2.0.0
11 | seaborn==0.7.1
12 | scikit-learn==0.18.1
13 | tensorflow-gpu==1.0.1  # 1.0.0
14 | Keras==1.2.2  # 2.0.2 / 2.0.3
15 | librosa==0.5.0
16 | audioread==2.1.4
17 | mutagen==1.39  # 1.39.dev0
18 | pydub==0.18.0
19 | #exiftool  # Only considered at some point.
20 | #eyed3  # Only considered at some point.
21 | requests==2.13.0
22 | pydot==1.2.3
23 | tqdm==4.11.2
24 | python-dotenv==0.6.3  # 0.6.4
25 | 
26 | # Dependencies of the above.
27 | certifi==2017.11.5
28 | click==6.7
29 | cycler==0.10.0
30 | Cython==0.25.2
31 | decorator==4.0.11  # 4.1.2
32 | joblib==0.11
33 | protobuf==3.2.0
34 | pyparsing==2.2.0
35 | python-dateutil==2.6.0
36 | pytz==2017.2  # 2016.10 / 2017.3
37 | PyYAML==3.12
38 | resampy==0.1.5
39 | scipy==0.19.0  # 0.18.1
40 | six==1.10.0
41 | Theano==0.9.0  # 0.8.2
42 | 
43 | # Jupyter notebook and its dependencies.
44 | notebook==5.0.0  # 4.4.1
45 | ipywidgets==6.0.0
46 | bleach==2.0.0  # 1.5.0
47 | entrypoints==0.2.2
48 | html5lib==0.999999999  # 0.9999999
49 | ipykernel==4.6.0  # 4.5.2 / 4.6.1
50 | ipython==5.3.0
51 | ipython-genutils==0.2.0  # 0.1.0
52 | Jinja2==2.9.6  # 2.9.5 / 2.10
53 | jsonschema==2.6.0
54 | jupyter-client==5.0.0  # 5.0.1
55 | jupyter-core==4.3.0
56 | MarkupSafe==0.23  # 1.0 (requires an old setuptools for `from setuptools import Feature`)
57 | mistune==0.7.4  # 0.7.3
58 | nbconvert==5.1.1
59 | nbformat==4.3.0
60 | packaging==16.8
61 | pandocfilters==1.4.1
62 | pexpect==4.2.1
63 | pickleshare==0.7.4
64 | prompt-toolkit==1.0.14  # 1.0.13
65 | ptyprocess==0.5.1
66 | Pygments==2.2.0
67 | pyparsing==2.2.0
68 | pyzmq==16.0.2
69 | simplegeneric==0.8.1
70 | terminado==0.6
71 | testpath==0.3
72 | tornado==4.4.3  # 4.4.2
73 | traitlets==4.3.2
74 | wcwidth==0.1.7
75 | webencodings==0.5.1
76 | widgetsnbextension==2.0.0
77 | 


--------------------------------------------------------------------------------
/webapi.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n",
  8 |     "\n",
  9 |     "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n",
 10 |     "\n",
 11 |     "## Free Music Archive web API\n",
 12 |     "\n",
 13 |     "All the data in the `raw_*.csv` tables was collected from the Free Music Archive [public API](https://freemusicarchive.org/api). With this notebook, you can:\n",
 14 |     "* reconstruct the original data, \n",
 15 |     "* update some fields, e.g. the `track listens` (play count),\n",
 16 |     "* augment the data with newer fields wich may have been introduced in their API,\n",
 17 |     "* update the dataset with new songs added to the archive.\n",
 18 |     "\n",
 19 |     "Notes:\n",
 20 |     "* You need a key to access the API, which you can [request online](https://freemusicarchive.org/api/agreement) and write into your `.env` file as a new line reading `FMA_KEY=MYPERSONALKEY`.\n",
 21 |     "* Requests take some hunderd milliseconds to complete."
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "import os\n",
 31 |     "import IPython.display as ipd\n",
 32 |     "import utils"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "fma = utils.FreeMusicArchive(os.environ.get('FMA_KEY'))"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "markdown",
 46 |    "metadata": {},
 47 |    "source": [
 48 |     "## 1 Get recently added tracks\n",
 49 |     "\n",
 50 |     "* `track_id` are assigned in monotonically increasing order.\n",
 51 |     "* Tracks can be removed, so that number does not indicate the number of available tracks."
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "for track_id, artist_name, date_created in zip(*fma.get_recent_tracks()):\n",
 61 |     "    print(track_id, date_created, artist_name)"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "## 2 Get metadata about tracks, albums and artists\n",
 69 |     "\n",
 70 |     "Given IDs, we can get information about tracks, albums and artists. See the available fields in the [API documentation](https://freemusicarchive.org/api)."
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "fma.get_track(track_id=2, fields=['track_title', 'track_date_created',\n",
 80 |     "                                  'track_duration', 'track_bit_rate',\n",
 81 |     "                                  'track_listens', 'track_interest', 'track_comments', 'track_favorites',\n",
 82 |     "                                  'artist_id', 'album_id'])"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": null,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "fma.get_track_genres(track_id=20)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "fma.get_album(album_id=1, fields=['album_title', 'album_tracks',\n",
101 |     "                                  'album_listens', 'album_comments', 'album_favorites',\n",
102 |     "                                  'album_date_created', 'album_date_released'])"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "fma.get_artist(artist_id=1, fields=['artist_name', 'artist_location',\n",
112 |     "                                    'artist_comments', 'artist_favorites'])"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "markdown",
117 |    "metadata": {},
118 |    "source": [
119 |     "## 3 Get data, i.e. raw audio\n",
120 |     "\n",
121 |     "We can download the original audio as well. Tracks are provided by the archive as MP3 with various bit and sample rates."
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "track_file = fma.get_track(2, 'track_file')\n",
131 |     "fma.download_track(track_file, path='track.mp3')"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "markdown",
136 |    "metadata": {},
137 |    "source": [
138 |     "## 4 Get genres\n",
139 |     "\n",
140 |     "Instead of compiling the genres of each track, we can get all the genres present on the archive with some API calls."
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "genres = fma.get_all_genres()\n",
150 |     "print('{} genres'.format(genres.shape[0]))\n",
151 |     "genres[10:25]"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "markdown",
156 |    "metadata": {},
157 |    "source": [
158 |     "And look for genres related to Rock."
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": null,
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": [
167 |     "genres[['Rock' in title for title in genres['genre_title']]]"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": null,
173 |    "metadata": {},
174 |    "outputs": [],
175 |    "source": [
176 |     "genres[genres['genre_parent_id'] == '12']"
177 |    ]
178 |   }
179 |  ],
180 |  "metadata": {},
181 |  "nbformat": 4,
182 |  "nbformat_minor": 2
183 | }
184 | 


--------------------------------------------------------------------------------
/features.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # FMA: A Dataset For Music Analysis
  4 | # Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.
  5 | 
  6 | # All features are extracted using [librosa](https://github.com/librosa/librosa).
  7 | # Alternatives:
  8 | # * [Essentia](http://essentia.upf.edu) (C++ with Python bindings)
  9 | # * [MARSYAS](https://github.com/marsyas/marsyas) (C++ with Python bindings)
 10 | # * [RP extract](http://www.ifs.tuwien.ac.at/mir/downloads.html) (Matlab, Java, Python)
 11 | # * [jMIR jAudio](http://jmir.sourceforge.net) (Java)
 12 | # * [MIRtoolbox](https://www.jyu.fi/hum/laitokset/musiikki/en/research/coe/materials/mirtoolbox) (Matlab)
 13 | 
 14 | import os
 15 | import multiprocessing
 16 | import warnings
 17 | import numpy as np
 18 | from scipy import stats
 19 | import pandas as pd
 20 | import librosa
 21 | from tqdm import tqdm
 22 | import utils
 23 | 
 24 | 
 25 | def columns():
 26 |     feature_sizes = dict(chroma_stft=12, chroma_cqt=12, chroma_cens=12,
 27 |                          tonnetz=6, mfcc=20, rmse=1, zcr=1,
 28 |                          spectral_centroid=1, spectral_bandwidth=1,
 29 |                          spectral_contrast=7, spectral_rolloff=1)
 30 |     moments = ('mean', 'std', 'skew', 'kurtosis', 'median', 'min', 'max')
 31 | 
 32 |     columns = []
 33 |     for name, size in feature_sizes.items():
 34 |         for moment in moments:
 35 |             it = ((name, moment, '{:02d}'.format(i+1)) for i in range(size))
 36 |             columns.extend(it)
 37 | 
 38 |     names = ('feature', 'statistics', 'number')
 39 |     columns = pd.MultiIndex.from_tuples(columns, names=names)
 40 | 
 41 |     # More efficient to slice if indexes are sorted.
 42 |     return columns.sort_values()
 43 | 
 44 | 
 45 | def compute_features(tid):
 46 | 
 47 |     features = pd.Series(index=columns(), dtype=np.float32, name=tid)
 48 | 
 49 |     # Catch warnings as exceptions (audioread leaks file descriptors).
 50 |     warnings.filterwarnings('error', module='librosa')
 51 | 
 52 |     def feature_stats(name, values):
 53 |         features[name, 'mean'] = np.mean(values, axis=1)
 54 |         features[name, 'std'] = np.std(values, axis=1)
 55 |         features[name, 'skew'] = stats.skew(values, axis=1)
 56 |         features[name, 'kurtosis'] = stats.kurtosis(values, axis=1)
 57 |         features[name, 'median'] = np.median(values, axis=1)
 58 |         features[name, 'min'] = np.min(values, axis=1)
 59 |         features[name, 'max'] = np.max(values, axis=1)
 60 | 
 61 |     try:
 62 |         filepath = utils.get_audio_path(os.environ.get('AUDIO_DIR'), tid)
 63 |         x, sr = librosa.load(filepath, sr=None, mono=True)  # kaiser_fast
 64 | 
 65 |         f = librosa.feature.zero_crossing_rate(x, frame_length=2048, hop_length=512)
 66 |         feature_stats('zcr', f)
 67 | 
 68 |         cqt = np.abs(librosa.cqt(x, sr=sr, hop_length=512, bins_per_octave=12,
 69 |                                  n_bins=7*12, tuning=None))
 70 |         assert cqt.shape[0] == 7 * 12
 71 |         assert np.ceil(len(x)/512) <= cqt.shape[1] <= np.ceil(len(x)/512)+1
 72 | 
 73 |         f = librosa.feature.chroma_cqt(C=cqt, n_chroma=12, n_octaves=7)
 74 |         feature_stats('chroma_cqt', f)
 75 |         f = librosa.feature.chroma_cens(C=cqt, n_chroma=12, n_octaves=7)
 76 |         feature_stats('chroma_cens', f)
 77 |         f = librosa.feature.tonnetz(chroma=f)
 78 |         feature_stats('tonnetz', f)
 79 | 
 80 |         del cqt
 81 |         stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
 82 |         assert stft.shape[0] == 1 + 2048 // 2
 83 |         assert np.ceil(len(x)/512) <= stft.shape[1] <= np.ceil(len(x)/512)+1
 84 |         del x
 85 | 
 86 |         f = librosa.feature.chroma_stft(S=stft**2, n_chroma=12)
 87 |         feature_stats('chroma_stft', f)
 88 | 
 89 |         f = librosa.feature.rmse(S=stft)
 90 |         feature_stats('rmse', f)
 91 | 
 92 |         f = librosa.feature.spectral_centroid(S=stft)
 93 |         feature_stats('spectral_centroid', f)
 94 |         f = librosa.feature.spectral_bandwidth(S=stft)
 95 |         feature_stats('spectral_bandwidth', f)
 96 |         f = librosa.feature.spectral_contrast(S=stft, n_bands=6)
 97 |         feature_stats('spectral_contrast', f)
 98 |         f = librosa.feature.spectral_rolloff(S=stft)
 99 |         feature_stats('spectral_rolloff', f)
100 | 
101 |         mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)
102 |         del stft
103 |         f = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)
104 |         feature_stats('mfcc', f)
105 | 
106 |     except Exception as e:
107 |         print('{}: {}'.format(tid, repr(e)))
108 | 
109 |     return features
110 | 
111 | 
112 | def main():
113 |     tracks = utils.load('tracks.csv')
114 |     features = pd.DataFrame(index=tracks.index,
115 |                             columns=columns(), dtype=np.float32)
116 | 
117 |     # More than usable CPUs to be CPU bound, not I/O bound. Beware memory.
118 |     nb_workers = int(1.5 * len(os.sched_getaffinity(0)))
119 | 
120 |     # Longest is ~11,000 seconds. Limit processes to avoid memory errors.
121 |     table = ((5000, 1), (3000, 3), (2000, 5), (1000, 10), (0, nb_workers))
122 |     for duration, nb_workers in table:
123 |         print('Working with {} processes.'.format(nb_workers))
124 | 
125 |         tids = tracks[tracks['track', 'duration'] >= duration].index
126 |         tracks.drop(tids, axis=0, inplace=True)
127 | 
128 |         pool = multiprocessing.Pool(nb_workers)
129 |         it = pool.imap_unordered(compute_features, tids)
130 | 
131 |         for i, row in enumerate(tqdm(it, total=len(tids))):
132 |             features.loc[row.name] = row
133 | 
134 |             if i % 1000 == 0:
135 |                 save(features, 10)
136 | 
137 |     save(features, 10)
138 |     test(features, 10)
139 | 
140 | 
141 | def save(features, ndigits):
142 | 
143 |     # Should be done already, just to be sure.
144 |     features.sort_index(axis=0, inplace=True)
145 |     features.sort_index(axis=1, inplace=True)
146 | 
147 |     features.to_csv('features.csv', float_format='%.{}e'.format(ndigits))
148 | 
149 | 
150 | def test(features, ndigits):
151 | 
152 |     indices = features[features.isnull().any(axis=1)].index
153 |     if len(indices) > 0:
154 |         print('Failed tracks: {}'.format(', '.join(str(i) for i in indices)))
155 | 
156 |     tmp = utils.load('features.csv')
157 |     np.testing.assert_allclose(tmp.values, features.values, rtol=10**-ndigits)
158 | 
159 | 
160 | if __name__ == "__main__":
161 |     main()
162 | 


--------------------------------------------------------------------------------
/creation.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # FMA: A Dataset For Music Analysis
  4 | # Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.
  5 | 
  6 | import os
  7 | import sys
  8 | import shutil
  9 | import pickle
 10 | import zipfile
 11 | import subprocess as sp
 12 | from datetime import datetime
 13 | from tqdm import tqdm, trange
 14 | import pandas as pd
 15 | import utils
 16 | 
 17 | 
 18 | TIME = datetime(2017, 4, 1).timestamp()
 19 | 
 20 | README = """This .zip archive is part of the FMA, a dataset for music analysis.
 21 | Code & data: https://github.com/mdeff/fma
 22 | Paper: https://arxiv.org/abs/1612.01840
 23 | 
 24 | Each .mp3 is licensed by its artist.
 25 | 
 26 | The content's integrity can be verified with sha1sum -c checksums.
 27 | """
 28 | 
 29 | 
 30 | def download_metadata():
 31 | 
 32 |     fma = utils.FreeMusicArchive(os.environ.get('FMA_KEY'))
 33 | 
 34 |     max_tid = int(fma.get_recent_tracks()[0][0])
 35 |     print('Largest track id: {}'.format(max_tid))
 36 | 
 37 |     not_found = {}
 38 | 
 39 |     id_range = trange(max_tid, desc='tracks')
 40 |     tracks, not_found['tracks'] = fma.get_all('track', id_range)
 41 | 
 42 |     id_range = tqdm(tracks['album_id'].unique(), desc='albums')
 43 |     albums, not_found['albums'] = fma.get_all('album', id_range)
 44 | 
 45 |     id_range = tqdm(tracks['artist_id'].unique(), desc='artists')
 46 |     artists, not_found['artists'] = fma.get_all('artist', id_range)
 47 | 
 48 |     genres = fma.get_all_genres()
 49 | 
 50 |     for dataset in 'tracks', 'albums', 'artists', 'genres':
 51 |         eval(dataset).sort_index(axis=0, inplace=True)
 52 |         eval(dataset).sort_index(axis=1, inplace=True)
 53 |         eval(dataset).to_csv('raw_' + dataset + '.csv')
 54 | 
 55 |     pickle.dump(not_found, open('not_found.pickle', 'wb'))
 56 | 
 57 | 
 58 | def _create_subdirs(dst_dir, tracks):
 59 | 
 60 |     # Get write access.
 61 |     if not os.path.exists(dst_dir):
 62 |         os.makedirs(dst_dir)
 63 |     os.chmod(dst_dir, 0o777)
 64 | 
 65 |     # Create writable sub-directories.
 66 |     n_folders = max(tracks.index) // 1000 + 1
 67 |     for folder in range(n_folders):
 68 |         dst = os.path.join(dst_dir, '{:03d}'.format(folder))
 69 |         if not os.path.exists(dst):
 70 |             os.makedirs(dst)
 71 |         os.chmod(dst, 0o777)
 72 | 
 73 | 
 74 | def download_data(dst_dir):
 75 | 
 76 |     dst_dir = os.path.abspath(dst_dir)
 77 |     tracks = pd.read_csv('raw_tracks.csv', index_col=0)
 78 |     _create_subdirs(dst_dir, tracks)
 79 | 
 80 |     fma = utils.FreeMusicArchive(os.environ.get('FMA_KEY'))
 81 |     not_found = pickle.load(open('not_found.pickle', 'rb'))
 82 |     not_found['audio'] = []
 83 | 
 84 |     # Download missing tracks.
 85 |     for tid in tqdm(tracks.index):
 86 |         dst = utils.get_audio_path(dst_dir, tid)
 87 |         if not os.path.exists(dst):
 88 |             try:
 89 |                 fma.download_track(tracks.at[tid, 'track_file'], dst)
 90 |             except:  # requests.HTTPError
 91 |                 not_found['audio'].append(tid)
 92 | 
 93 |     pickle.dump(not_found, open('not_found.pickle', 'wb'))
 94 | 
 95 | 
 96 | def convert_duration(x):
 97 |     times = x.split(':')
 98 |     seconds = int(times[-1])
 99 |     minutes = int(times[-2])
100 |     try:
101 |         minutes += 60 * int(times[-3])
102 |     except IndexError:
103 |         pass
104 |     return seconds + 60 * minutes
105 | 
106 | 
107 | def trim_audio(dst_dir):
108 | 
109 |     dst_dir = os.path.abspath(dst_dir)
110 |     fma_full = os.path.join(dst_dir, 'fma_full')
111 |     fma_large = os.path.join(dst_dir, 'fma_large')
112 |     tracks = pd.read_csv('raw_tracks.csv', index_col=0)
113 |     _create_subdirs(fma_large, tracks)
114 | 
115 |     not_found = pickle.load(open('not_found.pickle', 'rb'))
116 |     not_found['clips'] = []
117 | 
118 |     for tid in tqdm(tracks.index):
119 |         duration = convert_duration(tracks.at[tid, 'track_duration'])
120 |         src = utils.get_audio_path(fma_full, tid)
121 |         dst = utils.get_audio_path(fma_large, tid)
122 |         if tid in not_found['audio']:
123 |             continue
124 |         elif os.path.exists(dst):
125 |             continue
126 |         elif duration <= 30:
127 |             shutil.copyfile(src, dst)
128 |         else:
129 |             start = duration // 2 - 15
130 |             command = ['ffmpeg', '-i', src,
131 |                        '-ss', str(start), '-t', '30',
132 |                        '-acodec', 'copy', dst]
133 |             try:
134 |                 sp.run(command, check=True, stderr=sp.DEVNULL)
135 |             except sp.CalledProcessError:
136 |                 not_found['clips'].append(tid)
137 | 
138 |     for tid in not_found['clips']:
139 |         try:
140 |             os.remove(utils.get_audio_path(fma_large, tid))
141 |         except FileNotFoundError:
142 |             pass
143 | 
144 |     pickle.dump(not_found, open('not_found.pickle', 'wb'))
145 | 
146 | 
147 | def normalize_permissions_times(dst_dir):
148 |     dst_dir = os.path.abspath(dst_dir)
149 |     for dirpath, dirnames, filenames in tqdm(os.walk(dst_dir)):
150 |         for name in filenames:
151 |             dst = os.path.join(dirpath, name)
152 |             os.chmod(dst, 0o444)
153 |             os.utime(dst, (TIME, TIME))
154 |         for name in dirnames:
155 |             dst = os.path.join(dirpath, name)
156 |             os.chmod(dst, 0o555)
157 |             os.utime(dst, (TIME, TIME))
158 | 
159 | 
160 | def create_zips(dst_dir):
161 | 
162 |     def get_filepaths(subset):
163 |         filepaths = []
164 |         tids = tracks.index[tracks['set', 'subset'] <= subset]
165 |         for tid in tids:
166 |             filepaths.append(utils.get_audio_path('', tid))
167 |         return filepaths
168 | 
169 |     def get_checksums(base_dir, filepaths):
170 |         """Checksums are assumed to be stored in order for efficiency."""
171 |         checksums = []
172 |         with open(os.path.join(dst_dir, base_dir, 'checksums')) as f:
173 |             for filepath in filepaths:
174 |                 exist = False
175 |                 for line in f:
176 |                     if filepath == line[42:-1]:
177 |                         exist = True
178 |                         break
179 |                 if not exist:
180 |                     raise ValueError('checksum not found: {}'.format(filepath))
181 |                 checksums.append(line)
182 |         return checksums
183 | 
184 |     def create_zip(zip_filename, base_dir, filepaths):
185 | 
186 |         # Audio: all compressions are the same.
187 |         # CSV: stored > deflated > BZIP2 > LZMA.
188 |         # LZMA is close to BZIP2 and too recent to be widely available (unzip).
189 |         compression = zipfile.ZIP_BZIP2
190 | 
191 |         zip_filepath = os.path.join(dst_dir, zip_filename)
192 |         with zipfile.ZipFile(zip_filepath, 'x', compression) as zf:
193 | 
194 |             def info(name):
195 |                 name = os.path.join(zip_filename[:-4], name)
196 |                 info = zipfile.ZipInfo(name, (2017, 4, 1, 0, 0, 0))
197 |                 info.external_attr = 0o444 << 16 | 0o2 << 30
198 |                 return info
199 | 
200 |             zf.writestr(info('README.txt'), README, compression)
201 | 
202 |             checksums = get_checksums(base_dir, filepaths)
203 |             zf.writestr(info('checksums'), ''.join(checksums), compression)
204 | 
205 |             for filepath in tqdm(filepaths):
206 |                 src = os.path.join(dst_dir, base_dir, filepath)
207 |                 dst = os.path.join(zip_filename[:-4], filepath)
208 |                 zf.write(src, dst)
209 | 
210 |         os.chmod(zip_filepath, 0o444)
211 |         os.utime(zip_filepath, (TIME, TIME))
212 | 
213 |     METADATA = [
214 |         'not_found.pickle',
215 |         'raw_genres.csv', 'raw_albums.csv',
216 |         'raw_artists.csv', 'raw_tracks.csv',
217 |         'tracks.csv', 'genres.csv',
218 |         'raw_echonest.csv', 'echonest.csv', 'features.csv',
219 |     ]
220 |     create_zip('fma_metadata.zip', 'fma_metadata', METADATA)
221 | 
222 |     tracks = utils.load('tracks.csv')
223 |     create_zip('fma_small.zip', 'fma_large', get_filepaths('small'))
224 |     create_zip('fma_medium.zip', 'fma_large', get_filepaths('medium'))
225 |     create_zip('fma_large.zip', 'fma_large', get_filepaths('large'))
226 |     create_zip('fma_full.zip', 'fma_full', get_filepaths('large'))
227 | 
228 | 
229 | if __name__ == "__main__":
230 |     if sys.argv[1] == 'metadata':
231 |         download_metadata()
232 |     elif sys.argv[1] == 'data':
233 |         download_data(sys.argv[2])
234 |     elif sys.argv[1] == 'clips':
235 |         trim_audio(sys.argv[2])
236 |     elif sys.argv[1] == 'normalize':
237 |         normalize_permissions_times(sys.argv[2])
238 |     elif sys.argv[1] == 'zips':
239 |         create_zips(sys.argv[2])
240 | 


--------------------------------------------------------------------------------
/usage.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n",
  8 |     "\n",
  9 |     "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n",
 10 |     "\n",
 11 |     "## Usage\n",
 12 |     "\n",
 13 |     "1. Go through the [paper] to understand what the data is about.\n",
 14 |     "1. Download some datasets from <https://github.com/mdeff/fma>.\n",
 15 |     "1. Uncompress the archives, e.g. with `unzip fma_small.zip`.\n",
 16 |     "1. Load and play with the data in this notebook.\n",
 17 |     "\n",
 18 |     "[paper]: https://arxiv.org/abs/1612.01840"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": null,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "%matplotlib inline\n",
 28 |     "\n",
 29 |     "import os\n",
 30 |     "\n",
 31 |     "import IPython.display as ipd\n",
 32 |     "import numpy as np\n",
 33 |     "import pandas as pd\n",
 34 |     "import matplotlib.pyplot as plt\n",
 35 |     "import seaborn as sns\n",
 36 |     "import sklearn as skl\n",
 37 |     "import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm\n",
 38 |     "import librosa\n",
 39 |     "import librosa.display\n",
 40 |     "\n",
 41 |     "import utils\n",
 42 |     "\n",
 43 |     "plt.rcParams['figure.figsize'] = (17, 5)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "# Directory where mp3 are stored.\n",
 53 |     "AUDIO_DIR = os.environ.get('AUDIO_DIR')\n",
 54 |     "\n",
 55 |     "# Load metadata and features.\n",
 56 |     "tracks = utils.load('data/fma_metadata/tracks.csv')\n",
 57 |     "genres = utils.load('data/fma_metadata/genres.csv')\n",
 58 |     "features = utils.load('data/fma_metadata/features.csv')\n",
 59 |     "echonest = utils.load('data/fma_metadata/echonest.csv')\n",
 60 |     "\n",
 61 |     "np.testing.assert_array_equal(features.index, tracks.index)\n",
 62 |     "assert echonest.index.isin(tracks.index).all()\n",
 63 |     "\n",
 64 |     "tracks.shape, genres.shape, features.shape, echonest.shape"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "markdown",
 69 |    "metadata": {},
 70 |    "source": [
 71 |     "## 1 Metadata\n",
 72 |     "\n",
 73 |     "The metadata table, a CSV file in the `fma_metadata.zip` archive, is composed of many colums:\n",
 74 |     "1. The index is the ID of the song, taken from the website, used as the name of the audio file.\n",
 75 |     "2. Per-track, per-album and per-artist metadata from the Free Music Archive website.\n",
 76 |     "3. Two columns to indicate the subset (small, medium, large) and the split (training, validation, test)."
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "ipd.display(tracks['track'].head())\n",
 86 |     "ipd.display(tracks['album'].head())\n",
 87 |     "ipd.display(tracks['artist'].head())\n",
 88 |     "ipd.display(tracks['set'].head())"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "### 1.1 Subsets\n",
 96 |     "\n",
 97 |     "The small and medium subsets can be selected with the below code."
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "small = tracks[tracks['set', 'subset'] <= 'small']\n",
107 |     "small.shape"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "medium = tracks[tracks['set', 'subset'] <= 'medium']\n",
117 |     "medium.shape"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "## 2 Genres\n",
125 |     "\n",
126 |     "The genre hierarchy is stored in `genres.csv` and distributed in `fma_metadata.zip`."
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": null,
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "print('{} top-level genres'.format(len(genres['top_level'].unique())))\n",
136 |     "genres.loc[genres['top_level'].unique()].sort_values('#tracks', ascending=False)"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "metadata": {},
143 |    "outputs": [],
144 |    "source": [
145 |     "genres.sort_values('#tracks').head(10)"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "markdown",
150 |    "metadata": {},
151 |    "source": [
152 |     "## 3 Features\n",
153 |     "\n",
154 |     "1. Features extracted from the audio for all tracks.\n",
155 |     "2. For some tracks, data colected from the [Echonest](http://the.echonest.com/) API."
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "print('{1} features for {0} tracks'.format(*features.shape))\n",
165 |     "columns = ['mfcc', 'chroma_cens', 'tonnetz', 'spectral_contrast']\n",
166 |     "columns.append(['spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff'])\n",
167 |     "columns.append(['rmse', 'zcr'])\n",
168 |     "for column in columns:\n",
169 |     "    ipd.display(features[column].head().style.format('{:.2f}'))"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "markdown",
174 |    "metadata": {},
175 |    "source": [
176 |     "### 3.1 Echonest features"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": null,
182 |    "metadata": {},
183 |    "outputs": [],
184 |    "source": [
185 |     "print('{1} features for {0} tracks'.format(*echonest.shape))\n",
186 |     "ipd.display(echonest['echonest', 'metadata'].head())\n",
187 |     "ipd.display(echonest['echonest', 'audio_features'].head())\n",
188 |     "ipd.display(echonest['echonest', 'social_features'].head())\n",
189 |     "ipd.display(echonest['echonest', 'ranks'].head())"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": null,
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "ipd.display(echonest['echonest', 'temporal_features'].head())\n",
199 |     "x = echonest.loc[2, ('echonest', 'temporal_features')]\n",
200 |     "plt.plot(x);"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {},
206 |    "source": [
207 |     "### 3.2 Features like MFCCs are discriminant"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": [
216 |     "small = tracks['set', 'subset'] <= 'small'\n",
217 |     "genre1 = tracks['track', 'genre_top'] == 'Instrumental'\n",
218 |     "genre2 = tracks['track', 'genre_top'] == 'Hip-Hop'\n",
219 |     "\n",
220 |     "X = features.loc[small & (genre1 | genre2), 'mfcc']\n",
221 |     "X = skl.decomposition.PCA(n_components=2).fit_transform(X)\n",
222 |     "\n",
223 |     "y = tracks.loc[small & (genre1 | genre2), ('track', 'genre_top')]\n",
224 |     "y = skl.preprocessing.LabelEncoder().fit_transform(y)\n",
225 |     "\n",
226 |     "plt.scatter(X[:,0], X[:,1], c=y, cmap='RdBu', alpha=0.5)\n",
227 |     "X.shape, y.shape"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "markdown",
232 |    "metadata": {},
233 |    "source": [
234 |     "## 4 Audio\n",
235 |     "\n",
236 |     "You can load the waveform and listen to audio in the notebook itself."
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": null,
242 |    "metadata": {},
243 |    "outputs": [],
244 |    "source": [
245 |     "filename = utils.get_audio_path(AUDIO_DIR, 2)\n",
246 |     "print('File: {}'.format(filename))\n",
247 |     "\n",
248 |     "x, sr = librosa.load(filename, sr=None, mono=True)\n",
249 |     "print('Duration: {:.2f}s, {} samples'.format(x.shape[-1] / sr, x.size))\n",
250 |     "\n",
251 |     "start, end = 7, 17\n",
252 |     "ipd.Audio(data=x[start*sr:end*sr], rate=sr)"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "markdown",
257 |    "metadata": {},
258 |    "source": [
259 |     "And use [librosa](https://github.com/librosa/librosa) to compute spectrograms and audio features."
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": null,
265 |    "metadata": {},
266 |    "outputs": [],
267 |    "source": [
268 |     "librosa.display.waveplot(x, sr, alpha=0.5);\n",
269 |     "plt.vlines([start, end], -1, 1)\n",
270 |     "\n",
271 |     "start = len(x) // 2\n",
272 |     "plt.figure()\n",
273 |     "plt.plot(x[start:start+2000])\n",
274 |     "plt.ylim((-1, 1));"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": null,
280 |    "metadata": {},
281 |    "outputs": [],
282 |    "source": [
283 |     "stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))\n",
284 |     "mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)\n",
285 |     "log_mel = librosa.logamplitude(mel)\n",
286 |     "\n",
287 |     "librosa.display.specshow(log_mel, sr=sr, hop_length=512, x_axis='time', y_axis='mel');"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "metadata": {},
294 |    "outputs": [],
295 |    "source": [
296 |     "mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)\n",
297 |     "mfcc = skl.preprocessing.StandardScaler().fit_transform(mfcc)\n",
298 |     "librosa.display.specshow(mfcc, sr=sr, x_axis='time');"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "markdown",
303 |    "metadata": {},
304 |    "source": [
305 |     "## 5 Genre classification"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "markdown",
310 |    "metadata": {},
311 |    "source": [
312 |     "### 5.1 From features"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": null,
318 |    "metadata": {},
319 |    "outputs": [],
320 |    "source": [
321 |     "small = tracks['set', 'subset'] <= 'small'\n",
322 |     "\n",
323 |     "train = tracks['set', 'split'] == 'training'\n",
324 |     "val = tracks['set', 'split'] == 'validation'\n",
325 |     "test = tracks['set', 'split'] == 'test'\n",
326 |     "\n",
327 |     "y_train = tracks.loc[small & train, ('track', 'genre_top')]\n",
328 |     "y_test = tracks.loc[small & test, ('track', 'genre_top')]\n",
329 |     "X_train = features.loc[small & train, 'mfcc']\n",
330 |     "X_test = features.loc[small & test, 'mfcc']\n",
331 |     "\n",
332 |     "print('{} training examples, {} testing examples'.format(y_train.size, y_test.size))\n",
333 |     "print('{} features, {} classes'.format(X_train.shape[1], np.unique(y_train).size))"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {},
340 |    "outputs": [],
341 |    "source": [
342 |     "# Be sure training samples are shuffled.\n",
343 |     "X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)\n",
344 |     "\n",
345 |     "# Standardize features by removing the mean and scaling to unit variance.\n",
346 |     "scaler = skl.preprocessing.StandardScaler(copy=False)\n",
347 |     "scaler.fit_transform(X_train)\n",
348 |     "scaler.transform(X_test)\n",
349 |     "\n",
350 |     "# Support vector classification.\n",
351 |     "clf = skl.svm.SVC()\n",
352 |     "clf.fit(X_train, y_train)\n",
353 |     "score = clf.score(X_test, y_test)\n",
354 |     "print('Accuracy: {:.2%}'.format(score))"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "markdown",
359 |    "metadata": {},
360 |    "source": [
361 |     "### 5.2 From audio"
362 |    ]
363 |   }
364 |  ],
365 |  "metadata": {},
366 |  "nbformat": 4,
367 |  "nbformat_minor": 2
368 | }
369 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import dotenv
  2 | import pydot
  3 | import requests
  4 | import numpy as np
  5 | import pandas as pd
  6 | import ctypes
  7 | import shutil
  8 | import multiprocessing
  9 | import multiprocessing.sharedctypes as sharedctypes
 10 | import os.path
 11 | import ast
 12 | 
 13 | 
 14 | # Number of samples per 30s audio clip.
 15 | # TODO: fix dataset to be constant.
 16 | NB_AUDIO_SAMPLES = 1321967
 17 | SAMPLING_RATE = 44100
 18 | 
 19 | # Load the environment from the .env file.
 20 | dotenv.load_dotenv(dotenv.find_dotenv())
 21 | 
 22 | 
 23 | class FreeMusicArchive:
 24 | 
 25 |     BASE_URL = 'https://freemusicarchive.org/api/get/'
 26 | 
 27 |     def __init__(self, api_key):
 28 |         self.api_key = api_key
 29 | 
 30 |     def get_recent_tracks(self):
 31 |         URL = 'https://freemusicarchive.org/recent.json'
 32 |         r = requests.get(URL)
 33 |         r.raise_for_status()
 34 |         tracks = []
 35 |         artists = []
 36 |         date_created = []
 37 |         for track in r.json()['aTracks']:
 38 |             tracks.append(track['track_id'])
 39 |             artists.append(track['artist_name'])
 40 |             date_created.append(track['track_date_created'])
 41 |         return tracks, artists, date_created
 42 | 
 43 |     def _get_data(self, dataset, fma_id, fields=None):
 44 |         url = self.BASE_URL + dataset + 's.json?'
 45 |         url += dataset + '_id=' + str(fma_id) + '&api_key=' + self.api_key
 46 |         # print(url)
 47 |         r = requests.get(url)
 48 |         r.raise_for_status()
 49 |         if r.json()['errors']:
 50 |             raise Exception(r.json()['errors'])
 51 |         data = r.json()['dataset'][0]
 52 |         r_id = data[dataset + '_id']
 53 |         if r_id != str(fma_id):
 54 |             raise Exception('The received id {} does not correspond to'
 55 |                             'the requested one {}'.format(r_id, fma_id))
 56 |         if fields is None:
 57 |             return data
 58 |         if type(fields) is list:
 59 |             ret = {}
 60 |             for field in fields:
 61 |                 ret[field] = data[field]
 62 |             return ret
 63 |         else:
 64 |             return data[fields]
 65 | 
 66 |     def get_track(self, track_id, fields=None):
 67 |         return self._get_data('track', track_id, fields)
 68 | 
 69 |     def get_album(self, album_id, fields=None):
 70 |         return self._get_data('album', album_id, fields)
 71 | 
 72 |     def get_artist(self, artist_id, fields=None):
 73 |         return self._get_data('artist', artist_id, fields)
 74 | 
 75 |     def get_all(self, dataset, id_range):
 76 |         index = dataset + '_id'
 77 | 
 78 |         id_ = 2 if dataset == 'track' else 1
 79 |         row = self._get_data(dataset, id_)
 80 |         df = pd.DataFrame(columns=row.keys())
 81 |         df.set_index(index, inplace=True)
 82 | 
 83 |         not_found_ids = []
 84 | 
 85 |         for id_ in id_range:
 86 |             try:
 87 |                 row = self._get_data(dataset, id_)
 88 |             except:
 89 |                 not_found_ids.append(id_)
 90 |                 continue
 91 |             row.pop(index)
 92 |             df = df.append(pd.Series(row, name=id_))
 93 | 
 94 |         return df, not_found_ids
 95 | 
 96 |     def download_track(self, track_file, path):
 97 |         url = 'https://files.freemusicarchive.org/' + track_file
 98 |         r = requests.get(url, stream=True)
 99 |         r.raise_for_status()
100 |         with open(path, 'wb') as f:
101 |             shutil.copyfileobj(r.raw, f)
102 | 
103 |     def get_track_genres(self, track_id):
104 |         genres = self.get_track(track_id, 'track_genres')
105 |         genre_ids = []
106 |         genre_titles = []
107 |         for genre in genres:
108 |             genre_ids.append(genre['genre_id'])
109 |             genre_titles.append(genre['genre_title'])
110 |         return genre_ids, genre_titles
111 | 
112 |     def get_all_genres(self):
113 |         df = pd.DataFrame(columns=['genre_parent_id', 'genre_title',
114 |                                    'genre_handle', 'genre_color'])
115 |         df.index.rename('genre_id', inplace=True)
116 | 
117 |         page = 1
118 |         while True:
119 |             url = self.BASE_URL + 'genres.json?limit=50'
120 |             url += '&page={}&api_key={}'.format(page, self.api_key)
121 |             r = requests.get(url)
122 |             for genre in r.json()['dataset']:
123 |                 genre_id = int(genre.pop(df.index.name))
124 |                 df.loc[genre_id] = genre
125 |             assert (r.json()['page'] == str(page))
126 |             page += 1
127 |             if page > r.json()['total_pages']:
128 |                 break
129 | 
130 |         return df
131 | 
132 | 
133 | class Genres:
134 | 
135 |     def __init__(self, genres_df):
136 |         self.df = genres_df
137 | 
138 |     def create_tree(self, roots, depth=None):
139 | 
140 |         if type(roots) is not list:
141 |             roots = [roots]
142 |         graph = pydot.Dot(graph_type='digraph', strict=True)
143 | 
144 |         def create_node(genre_id):
145 |             title = self.df.at[genre_id, 'title']
146 |             ntracks = self.df.at[genre_id, '#tracks']
147 |             # name = self.df.at[genre_id, 'title'] + '\n' + str(genre_id)
148 |             name = '"{}\n{} / {}"'.format(title, genre_id, ntracks)
149 |             return pydot.Node(name)
150 | 
151 |         def create_tree(root_id, node_p, depth):
152 |             if depth == 0:
153 |                 return
154 |             children = self.df[self.df['parent'] == root_id]
155 |             for child in children.iterrows():
156 |                 genre_id = child[0]
157 |                 node_c = create_node(genre_id)
158 |                 graph.add_edge(pydot.Edge(node_p, node_c))
159 |                 create_tree(genre_id, node_c,
160 |                             depth-1 if depth is not None else None)
161 | 
162 |         for root in roots:
163 |             node_p = create_node(root)
164 |             graph.add_node(node_p)
165 |             create_tree(root, node_p, depth)
166 | 
167 |         return graph
168 | 
169 |     def find_roots(self):
170 |         roots = []
171 |         for gid, row in self.df.iterrows():
172 |             parent = row['parent']
173 |             title = row['title']
174 |             if parent == 0:
175 |                 roots.append(gid)
176 |             elif parent not in self.df.index:
177 |                 msg = '{} ({}) has parent {} which is missing'.format(
178 |                         gid, title, parent)
179 |                 raise RuntimeError(msg)
180 |         return roots
181 | 
182 | 
183 | def load(filepath):
184 | 
185 |     filename = os.path.basename(filepath)
186 | 
187 |     if 'features' in filename:
188 |         return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])
189 | 
190 |     if 'echonest' in filename:
191 |         return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])
192 | 
193 |     if 'genres' in filename:
194 |         return pd.read_csv(filepath, index_col=0)
195 | 
196 |     if 'tracks' in filename:
197 |         tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])
198 | 
199 |         COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
200 |                    ('track', 'genres'), ('track', 'genres_all')]
201 |         for column in COLUMNS:
202 |             tracks[column] = tracks[column].map(ast.literal_eval)
203 | 
204 |         COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
205 |                    ('album', 'date_created'), ('album', 'date_released'),
206 |                    ('artist', 'date_created'), ('artist', 'active_year_begin'),
207 |                    ('artist', 'active_year_end')]
208 |         for column in COLUMNS:
209 |             tracks[column] = pd.to_datetime(tracks[column])
210 | 
211 |         SUBSETS = ('small', 'medium', 'large')
212 |         try:
213 |             tracks['set', 'subset'] = tracks['set', 'subset'].astype(
214 |                     'category', categories=SUBSETS, ordered=True)
215 |         except (ValueError, TypeError):
216 |             # the categories and ordered arguments were removed in pandas 0.25
217 |             tracks['set', 'subset'] = tracks['set', 'subset'].astype(
218 |                      pd.CategoricalDtype(categories=SUBSETS, ordered=True))
219 | 
220 |         COLUMNS = [('track', 'genre_top'), ('track', 'license'),
221 |                    ('album', 'type'), ('album', 'information'),
222 |                    ('artist', 'bio')]
223 |         for column in COLUMNS:
224 |             tracks[column] = tracks[column].astype('category')
225 | 
226 |         return tracks
227 | 
228 | 
229 | def get_audio_path(audio_dir, track_id):
230 |     """
231 |     Return the path to the mp3 given the directory where the audio is stored
232 |     and the track ID.
233 | 
234 |     Examples
235 |     --------
236 |     >>> import utils
237 |     >>> AUDIO_DIR = os.environ.get('AUDIO_DIR')
238 |     >>> utils.get_audio_path(AUDIO_DIR, 2)
239 |     '../data/fma_small/000/000002.mp3'
240 | 
241 |     """
242 |     tid_str = '{:06d}'.format(track_id)
243 |     return os.path.join(audio_dir, tid_str[:3], tid_str + '.mp3')
244 | 
245 | 
246 | class Loader:
247 |     def load(self, filepath):
248 |         raise NotImplementedError()
249 | 
250 | 
251 | class RawAudioLoader(Loader):
252 |     def __init__(self, sampling_rate=SAMPLING_RATE):
253 |         self.sampling_rate = sampling_rate
254 |         self.shape = (NB_AUDIO_SAMPLES * sampling_rate // SAMPLING_RATE, )
255 | 
256 |     def load(self, filepath):
257 |         return self._load(filepath)[:self.shape[0]]
258 | 
259 | 
260 | class LibrosaLoader(RawAudioLoader):
261 |     def _load(self, filepath):
262 |         import librosa
263 |         sr = self.sampling_rate if self.sampling_rate != SAMPLING_RATE else None
264 |         # kaiser_fast is 3x faster than kaiser_best
265 |         # x, sr = librosa.load(filepath, sr=sr, res_type='kaiser_fast')
266 |         x, sr = librosa.load(filepath, sr=sr)
267 |         return x
268 | 
269 | 
270 | class AudioreadLoader(RawAudioLoader):
271 |     def _load(self, filepath):
272 |         import audioread
273 |         a = audioread.audio_open(filepath)
274 |         a.read_data()
275 | 
276 | 
277 | class PydubLoader(RawAudioLoader):
278 |     def _load(self, filepath):
279 |         from pydub import AudioSegment
280 |         song = AudioSegment.from_file(filepath)
281 |         song = song.set_channels(1)
282 |         x = song.get_array_of_samples()
283 |         # print(filepath) if song.channels != 2 else None
284 |         return np.array(x)
285 | 
286 | 
287 | class FfmpegLoader(RawAudioLoader):
288 |     def _load(self, filepath):
289 |         """Fastest and less CPU intensive loading method."""
290 |         import subprocess as sp
291 |         command = ['ffmpeg',
292 |                    '-i', filepath,
293 |                    '-f', 's16le',
294 |                    '-acodec', 'pcm_s16le',
295 |                    '-ac', '1']  # channels: 2 for stereo, 1 for mono
296 |         if self.sampling_rate != SAMPLING_RATE:
297 |             command.extend(['-ar', str(self.sampling_rate)])
298 |         command.append('-')
299 |         # 30s at 44.1 kHz ~= 1.3e6
300 |         proc = sp.run(command, stdout=sp.PIPE, bufsize=10**7, stderr=sp.DEVNULL, check=True)
301 | 
302 |         return np.fromstring(proc.stdout, dtype="int16")
303 | 
304 | 
305 | def build_sample_loader(audio_dir, Y, loader):
306 | 
307 |     class SampleLoader:
308 | 
309 |         def __init__(self, tids, batch_size=4):
310 |             self.lock1 = multiprocessing.Lock()
311 |             self.lock2 = multiprocessing.Lock()
312 |             self.batch_foremost = sharedctypes.RawValue(ctypes.c_int, 0)
313 |             self.batch_rearmost = sharedctypes.RawValue(ctypes.c_int, -1)
314 |             self.condition = multiprocessing.Condition(lock=self.lock2)
315 | 
316 |             data = sharedctypes.RawArray(ctypes.c_int, tids.data)
317 |             self.tids = np.ctypeslib.as_array(data)
318 | 
319 |             self.batch_size = batch_size
320 |             self.loader = loader
321 |             self.X = np.empty((self.batch_size, *loader.shape))
322 |             self.Y = np.empty((self.batch_size, Y.shape[1]), dtype=np.int)
323 | 
324 |         def __iter__(self):
325 |             return self
326 | 
327 |         def __next__(self):
328 | 
329 |             with self.lock1:
330 |                 if self.batch_foremost.value == 0:
331 |                     np.random.shuffle(self.tids)
332 | 
333 |                 batch_current = self.batch_foremost.value
334 |                 if self.batch_foremost.value + self.batch_size < self.tids.size:
335 |                     batch_size = self.batch_size
336 |                     self.batch_foremost.value += self.batch_size
337 |                 else:
338 |                     batch_size = self.tids.size - self.batch_foremost.value
339 |                     self.batch_foremost.value = 0
340 | 
341 |                 # print(self.tids, self.batch_foremost.value, batch_current, self.tids[batch_current], batch_size)
342 |                 # print('queue', self.tids[batch_current], batch_size)
343 |                 tids = np.array(self.tids[batch_current:batch_current+batch_size])
344 | 
345 |             batch_size = 0
346 |             for tid in tids:
347 |                 try:
348 |                     audio_path = get_audio_path(audio_dir, tid)
349 |                     self.X[batch_size] = self.loader.load(audio_path)
350 |                     self.Y[batch_size] = Y.loc[tid]
351 |                     batch_size += 1
352 |                 except Exception as e:
353 |                     print("\nIgnoring " + audio_path +" (error: " + str(e) +").")
354 | 
355 |             with self.lock2:
356 |                 while (batch_current - self.batch_rearmost.value) % self.tids.size > self.batch_size:
357 |                     # print('wait', indices[0], batch_current, self.batch_rearmost.value)
358 |                     self.condition.wait()
359 |                 self.condition.notify_all()
360 |                 # print('yield', indices[0], batch_current, self.batch_rearmost.value)
361 |                 self.batch_rearmost.value = batch_current
362 | 
363 |                 return self.X[:batch_size], self.Y[:batch_size]
364 | 
365 |     return SampleLoader
366 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # FMA: A Dataset For Music Analysis
  2 | 
  3 | [Michaël Defferrard](https://deff.ch),
  4 | [Kirell Benzi](https://kirellbenzi.com),
  5 | [Pierre Vandergheynst](https://people.epfl.ch/pierre.vandergheynst),
  6 | [Xavier Bresson](https://www.ntu.edu.sg/home/xbresson). \
  7 | International Society for Music Information Retrieval Conference (ISMIR), 2017.
  8 | 
  9 | > We introduce the Free Music Archive (FMA), an open and easily accessible
 10 | > dataset suitable for evaluating several tasks in MIR, a field concerned with
 11 | > browsing, searching, and organizing large music collections. The community's
 12 | > growing interest in feature and end-to-end learning is however restrained by
 13 | > the limited availability of large audio datasets. The FMA aims to overcome
 14 | > this hurdle by providing 917 GiB and 343 days of Creative Commons-licensed
 15 | > audio from 106,574 tracks from 16,341 artists and 14,854 albums, arranged in
 16 | > a hierarchical taxonomy of 161 genres. It provides full-length and
 17 | > high-quality audio, pre-computed features, together with track- and
 18 | > user-level metadata, tags, and free-form text such as biographies. We here
 19 | > describe the dataset and how it was created, propose a train/validation/test
 20 | > split and three subsets, discuss some suitable MIR tasks, and evaluate some
 21 | > baselines for genre recognition. Code, data, and usage examples are available
 22 | > at <https://github.com/mdeff/fma>.
 23 | 
 24 | * Paper: [`arXiv:1612.01840`][paper] ([latex and reviews](https://github.com/mdeff/paper-fma-ismir2017))
 25 | * Slides: [`doi:10.5281/zenodo.1066119`](https://doi.org/10.5281/zenodo.1066119)
 26 | * Poster: [`doi:10.5281/zenodo.1035847`](https://doi.org/10.5281/zenodo.1035847)
 27 | 
 28 | [paper]: https://arxiv.org/abs/1612.01840
 29 | [FMA]: https://freemusicarchive.org
 30 | 
 31 | ## Data
 32 | 
 33 | All metadata and features for all tracks are distributed in **[`fma_metadata.zip`]** (342 MiB).
 34 | The below tables can be used with [pandas] or any other data analysis tool.
 35 | See the [paper] or the [`usage.ipynb`] notebook for a description.
 36 | * `tracks.csv`: per track metadata such as ID, title, artist, genres, tags and play counts, for all 106,574 tracks.
 37 | * `genres.csv`: all 163 genres with name and parent (used to infer the genre hierarchy and top-level genres).
 38 | * `features.csv`: common features extracted with [librosa].
 39 | * `echonest.csv`: audio features provided by [Echonest] (now [Spotify]) for a subset of 13,129 tracks.
 40 | 
 41 | [pandas]:   https://pandas.pydata.org/
 42 | [librosa]:  https://librosa.org/
 43 | [spotify]:  https://www.spotify.com/
 44 | [echonest]: https://web.archive.org/web/20170519050040/http://the.echonest.com/
 45 | 
 46 | Then, you got various sizes of MP3-encoded audio data:
 47 | 
 48 | 1. **[`fma_small.zip`]**: 8,000 tracks of 30s, 8 balanced genres (GTZAN-like) (7.2 GiB)
 49 | 2. **[`fma_medium.zip`]**: 25,000 tracks of 30s, 16 unbalanced genres (22 GiB)
 50 | 3. **[`fma_large.zip`]**: 106,574 tracks of 30s, 161 unbalanced genres (93 GiB)
 51 | 4. **[`fma_full.zip`]**: 106,574 untrimmed tracks, 161 unbalanced genres (879 GiB)
 52 | 
 53 | [`fma_metadata.zip`]: https://os.unil.cloud.switch.ch/fma/fma_metadata.zip
 54 | [`fma_small.zip`]:    https://os.unil.cloud.switch.ch/fma/fma_small.zip
 55 | [`fma_medium.zip`]:   https://os.unil.cloud.switch.ch/fma/fma_medium.zip
 56 | [`fma_large.zip`]:    https://os.unil.cloud.switch.ch/fma/fma_large.zip
 57 | [`fma_full.zip`]:     https://os.unil.cloud.switch.ch/fma/fma_full.zip
 58 | 
 59 | See the [wiki](https://github.com/mdeff/fma/wiki) (or [#41](https://github.com/mdeff/fma/issues/41)) for **known issues (errata)**.
 60 | 
 61 | ## Code
 62 | 
 63 | The following notebooks, scripts, and modules have been developed for the dataset.
 64 | 
 65 | 1. [`usage.ipynb`]: shows how to load the datasets and develop, train, and test your own models with it.
 66 | 2. [`analysis.ipynb`]: exploration of the metadata, data, and features.
 67 |    Creates the [figures](https://github.com/mdeff/fma/tree/outputs/figures) used in the paper.
 68 | 3. [`baselines.ipynb`]: baseline models for genre recognition, both from audio and features.
 69 | 4. [`features.py`]: features extraction from the audio (used to create `features.csv`).
 70 | 5. [`webapi.ipynb`]: query the web API of the [FMA]. Can be used to update the dataset.
 71 | 6. [`creation.ipynb`]: creation of the dataset (used to create `tracks.csv` and `genres.csv`).
 72 | 7. [`creation.py`]: creation of the dataset (long-running data collection and processing).
 73 | 8. [`utils.py`]: helper functions and classes.
 74 | 
 75 | [`usage.ipynb`]:     https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/usage.ipynb
 76 | [`analysis.ipynb`]:  https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/analysis.ipynb
 77 | [`baselines.ipynb`]: https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/baselines.ipynb
 78 | [`features.py`]:     features.py
 79 | [`webapi.ipynb`]:    https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/webapi.ipynb
 80 | [`creation.ipynb`]:  https://nbviewer.jupyter.org/github/mdeff/fma/blob/outputs/creation.ipynb
 81 | [`creation.py`]:     creation.py
 82 | [`utils.py`]:        utils.py
 83 | 
 84 | ## Usage
 85 | 
 86 | [![Binder](https://static.mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/mdeff/fma/outputs?urlpath=lab/tree/usage.ipynb)
 87 | &nbsp; Click the binder badge to play with the code and data from your browser without installing anything.
 88 | 
 89 | 1. Clone the repository.
 90 |     ```sh
 91 |     git clone https://github.com/mdeff/fma.git
 92 |     cd fma
 93 |     ```
 94 | 
 95 | 1. <details><summary>Create a Python 3.6 environment.</summary>
 96 | 
 97 |     ```sh
 98 |     # with https://conda.io
 99 |     conda create -n fma python=3.6
100 |     conda activate fma
101 | 
102 |     # with https://github.com/pyenv/pyenv
103 |     pyenv install 3.6.0
104 |     pyenv virtualenv 3.6.0 fma
105 |     pyenv activate fma
106 | 
107 |     # with https://pipenv.pypa.io
108 |     pipenv --python 3.6
109 |     pipenv shell
110 | 
111 |     # with https://docs.python.org/3/tutorial/venv.html
112 |     python3.6 -m venv ./env
113 |     source ./env/bin/activate
114 |     ```
115 |     </details>
116 | 
117 | 1. Install dependencies.
118 |     ```sh
119 |     pip install --upgrade pip setuptools wheel
120 |     pip install numpy==1.12.1  # workaround resampy's bogus setup.py
121 |     pip install -r requirements.txt
122 |     ```
123 |     Note: you may need to install [ffmpeg](https://ffmpeg.org/download.html) or [graphviz](https://www.graphviz.org) depending on your usage.\
124 |     Note: install [CUDA](https://en.wikipedia.org/wiki/CUDA) to train neural networks on GPUs (see [Tensorflow's instructions](https://www.tensorflow.org/install/)).
125 | 
126 | 1. Download some data, verify its integrity, and uncompress the archives.
127 |     ```sh
128 |     cd data
129 | 
130 |     curl -O https://os.unil.cloud.switch.ch/fma/fma_metadata.zip
131 |     curl -O https://os.unil.cloud.switch.ch/fma/fma_small.zip
132 |     curl -O https://os.unil.cloud.switch.ch/fma/fma_medium.zip
133 |     curl -O https://os.unil.cloud.switch.ch/fma/fma_large.zip
134 |     curl -O https://os.unil.cloud.switch.ch/fma/fma_full.zip
135 | 
136 |     echo "f0df49ffe5f2a6008d7dc83c6915b31835dfe733  fma_metadata.zip" | sha1sum -c -
137 |     echo "ade154f733639d52e35e32f5593efe5be76c6d70  fma_small.zip"    | sha1sum -c -
138 |     echo "c67b69ea232021025fca9231fc1c7c1a063ab50b  fma_medium.zip"   | sha1sum -c -
139 |     echo "497109f4dd721066b5ce5e5f250ec604dc78939e  fma_large.zip"    | sha1sum -c -
140 |     echo "0f0ace23fbe9ba30ecb7e95f763e435ea802b8ab  fma_full.zip"     | sha1sum -c -
141 | 
142 |     unzip fma_metadata.zip
143 |     unzip fma_small.zip
144 |     unzip fma_medium.zip
145 |     unzip fma_large.zip
146 |     unzip fma_full.zip
147 | 
148 |     cd ..
149 |     ```
150 | 
151 |     Note: try [7zip](https://www.7-zip.org) if decompression errors.
152 |     It might be an [unsupported compression issue](https://github.com/mdeff/fma/issues/5).
153 | 
154 | 1. Fill a `.env` configuration file (at repository's root) with the following content.
155 |     ```
156 |     AUDIO_DIR=./data/fma_small/  # the path to a decompressed fma_*.zip
157 |     FMA_KEY=MYKEY  # only if you want to query the freemusicarchive.org API
158 |     ```
159 | 
160 | 1. Open Jupyter or run a notebook.
161 |     ```sh
162 |     jupyter notebook
163 |     make usage.ipynb
164 |     ```
165 | 
166 | ## Impact, coverage, and resources
167 | 
168 | <details><summary>100+ research papers</summary>
169 | 
170 | Full list on [Google Scholar](https://scholar.google.com/scholar?cites=13646959466952873682,13785796238335741238,7544459641098681164,5736399534855095976).
171 | Some picks below.
172 | 
173 | * [Zero-shot Learning for Audio-based Music Classification and Tagging](https://arxiv.org/abs/1907.02670)
174 | * [One deep music representation to rule them all? A comparative analysis of different representation learning strategies](https://doi.org/10.1007/s00521-019-04076-1)
175 | * [Deep Learning for Audio-Based Music Classification and Tagging: Teaching Computers to Distinguish Rock from Bach](https://sci-hub.tw/10.1109/MSP.2018.2874383)
176 | * [Learning Discrete Structures for Graph Neural Networks](https://arxiv.org/abs/1903.11960)
177 | * [A context encoder for audio inpainting](https://arxiv.org/abs/1810.12138)
178 | * [OpenMIC-2018: An Open Data-set for Multiple Instrument Recognition](https://archives.ismir.net/ismir2018/paper/000248.pdf)
179 | * [Detecting Music Genre Using Extreme Gradient Boosting](https://doi.org/10.1145/3184558.3191822)
180 | * [Transfer Learning of Artist Group Factors to Musical Genre Classification](https://doi.org/10.1145/3184558.3191823)
181 | * [Learning to Recognize Musical Genre from Audio: Challenge Overview](https://arxiv.org/abs/1803.05337)
182 | * [Representation Learning of Music Using Artist Labels](https://arxiv.org/abs/1710.06648)
183 | 
184 | </details>
185 | 
186 | <details><summary>2 derived works</summary>
187 | 
188 | * [OpenMIC-2018: An Open Data-set for Multiple Instrument Recognition](https://github.com/cosmir/openmic-2018)
189 | * [ConvNet features](https://github.com/keunwoochoi/FMA_convnet_features) from [Transfer learning for music classification and regression tasks](https://arxiv.org/abs/1703.09179)
190 | 
191 | </details>
192 | 
193 | <details><summary>~10 posts</summary>
194 | 
195 | * [Music Genre Classification With TensorFlow](https://towardsdatascience.com/music-genre-classification-with-tensorflow-3de38f0d4dbb), Towards Data Science, 2020-08-11.
196 | * [Music Genre Classification: Transformers vs Recurrent Neural Networks](https://towardsdatascience.com/music-genre-classification-transformers-vs-recurrent-neural-networks-631751a71c58), Towards Data Science, 2020-06-14.
197 | * [Using CNNs and RNNs for Music Genre Recognition](https://towardsdatascience.com/using-cnns-and-rnns-for-music-genre-recognition-2435fb2ed6af), Towards Data Science, 2018-12-13.
198 | * [Over 1.5 TB’s of Labeled Audio Datasets](https://towardsdatascience.com/a-data-lakes-worth-of-audio-datasets-b45b88cd4ad), Towards Data Science, 2018-11-13.
199 | * [Discovering Descriptive Music Genres Using K-Means Clustering](https://medium.com/latinxinai/discovering-descriptive-music-genres-using-k-means-clustering-d19bdea5e443), Medium, 2018-04-09.
200 | * [25 Open Datasets for Deep Learning Every Data Scientist Must Work With](https://www.analyticsvidhya.com/blog/2018/03/comprehensive-collection-deep-learning-datasets/), Analytics Vidhya, 2018-03-29.
201 | * [Learning Music Genres](https://medium.com/@diegoagher/learning-music-genres-5ab1cabadfed), Medium, 2017-12-13.
202 | * [music2vec: Generating Vector Embeddings for Genre-Classification Task](https://medium.com/@rajatheb/music2vec-generating-vector-embedding-for-genre-classification-task-411187a20820), Medium, 2017-11-28.
203 | * [A Music Information Retrieval Dataset, Made With FMA](https://web.archive.org/web/20190907182116/http://freemusicarchive.org/member/cheyenne_h/blog/A_Music_Information_Retrieval_Dataset_Made_With_FMA), freemusicarchive.org, 2017-05-22.
204 | * [Pre-publication release announced](https://twitter.com/m_deff/status/861985446116589569), twitter.com, 2017-05-09.
205 | * [FMA: A Dataset For Music Analysis](https://tensorflow.blog/2017/03/14/fma-a-dataset-for-music-analysis), tensorflow.blog, 2017-03-14.
206 | * [Beta release discussed](https://twitter.com/YadFaeq/status/829406463286063104), twitter.com, 2017-02-08.
207 | * [FMA Data Set for Researchers Released](https://web.archive.org/web/20190826112752/http://freemusicarchive.org/member/cheyenne_h/blog/FMA_Dataset_for_Researchers), freemusicarchive.org, 2016-12-15.
208 | 
209 | </details>
210 | 
211 | <details><summary>5 events</summary>
212 | 
213 | * [Summer Workshop](https://hcdigitalscholarship.github.io/audio-files) by the [Haverford Digital Scholarship Library](https://www.haverford.edu/library/digital-scholarship), 2020-07.
214 | * [Genre recognition challenge](https://www.crowdai.org/challenges/www-2018-challenge-learning-to-recognize-musical-genre) at the [Web Conference](https://www2018.thewebconf.org/program/challenges-track/), Lyon, 2018-04.
215 | * [Slides](https://doi.org/10.5281/zenodo.1066119) presented at the [Data Jam days](http://datajamdays.org), Lausanne, 2017-11-24.
216 | * [Poster](https://doi.org/10.5281/zenodo.1035847) presented at [ISMIR 2017](https://ismir2017.ismir.net), Suzhou, 2017-10-24.
217 | * [Slides](https://doi.org/10.5281/zenodo.999353) for the [Open Science in Practice](https://osip2017.epfl.ch) summer school at EPFL, 2017-09-29.
218 | 
219 | </details>
220 | 
221 | <details><summary>~10 dataset lists</summary>
222 | 
223 | * <https://github.com/caesar0301/awesome-public-datasets>
224 | * <https://archive.ics.uci.edu/ml/datasets/FMA:+A+Dataset+For+Music+Analysis>
225 | * <http://deeplearning.net/datasets>
226 | * <http://www.audiocontentanalysis.org/data-sets>
227 | * <https://github.com/ismir/mir-datasets>
228 | * <https://teachingmir.wikispaces.com/Datasets>
229 | * <https://en.wikipedia.org/wiki/List_of_datasets_for_machine_learning_research>
230 | * <https://loc.gov/item/2018655052>
231 | * <https://cloudlab.atlassian.net/wiki/display/datasets/FMA:+A+Dataset+For+Music+Analysis>
232 | * <https://www.datasetlist.com>
233 | * <https://data-flair.training/blogs/deep-learning-project-ideas>
234 | 
235 | </details>
236 | 
237 | ## Contributing
238 | 
239 | Contribute by opening an [issue](https://github.com/mdeff/fma/issues) or a [pull request](https://github.com/mdeff/fma/pulls).
240 | Let this repository be a hub around the dataset!
241 | 
242 | ## History
243 | 
244 | **2017-05-09 pre-publication release**
245 | * paper: [arXiv:1612.01840v2](https://arxiv.org/abs/1612.01840v2)
246 | * code: [git tag rc1](https://github.com/mdeff/fma/releases/tag/rc1)
247 | * `fma_metadata.zip` sha1: `f0df49ffe5f2a6008d7dc83c6915b31835dfe733`
248 | * `fma_small.zip`    sha1: `ade154f733639d52e35e32f5593efe5be76c6d70`
249 | * `fma_medium.zip`   sha1: `c67b69ea232021025fca9231fc1c7c1a063ab50b`
250 | * `fma_large.zip`    sha1: `497109f4dd721066b5ce5e5f250ec604dc78939e`
251 | * `fma_full.zip`     sha1: `0f0ace23fbe9ba30ecb7e95f763e435ea802b8ab`
252 | * known issues: see [#41](https://github.com/mdeff/fma/issues/41)
253 | 
254 | **2016-12-06 beta release**
255 | * paper: [arXiv:1612.01840v1](https://arxiv.org/abs/1612.01840v1)
256 | * code: [git tag beta](https://github.com/mdeff/fma/releases/tag/beta)
257 | * `fma_small.zip`  sha1: `e731a5d56a5625f7b7f770923ee32922374e2cbf`
258 | * `fma_medium.zip` sha1: `fe23d6f2a400821ed1271ded6bcd530b7a8ea551`
259 | 
260 | ## Acknowledgments and Licenses
261 | 
262 | We are grateful to the [Swiss Data Science Center] ([EPFL] and [ETHZ]) for hosting the dataset.
263 | 
264 | Please cite our work if you use our code or data.
265 | 
266 | ```
267 | @inproceedings{fma_dataset,
268 |   title = {{FMA}: A Dataset for Music Analysis},
269 |   author = {Defferrard, Micha\"el and Benzi, Kirell and Vandergheynst, Pierre and Bresson, Xavier},
270 |   booktitle = {18th International Society for Music Information Retrieval Conference (ISMIR)},
271 |   year = {2017},
272 |   archiveprefix = {arXiv},
273 |   eprint = {1612.01840},
274 |   url = {https://arxiv.org/abs/1612.01840},
275 | }
276 | ```
277 | 
278 | ```
279 | @inproceedings{fma_challenge,
280 |   title = {Learning to Recognize Musical Genre from Audio},
281 |   subtitle = {Challenge Overview},
282 |   author = {Defferrard, Micha\"el and Mohanty, Sharada P. and Carroll, Sean F. and Salath\'e, Marcel},
283 |   booktitle = {The 2018 Web Conference Companion},
284 |   year = {2018},
285 |   publisher = {ACM Press},
286 |   isbn = {9781450356404},
287 |   doi = {10.1145/3184558.3192310},
288 |   archiveprefix = {arXiv},
289 |   eprint = {1803.05337},
290 |   url = {https://arxiv.org/abs/1803.05337},
291 | }
292 | ```
293 | 
294 | * The code in this repository is released under the [MIT license](LICENSE.txt).
295 | * The metadata is released under the [Creative Commons Attribution 4.0 International License (CC BY 4.0)][ccby40].
296 | * We do not hold the copyright on the audio and distribute it under the license chosen by the artist.
297 | * The dataset is meant for research purposes.
298 | 
299 | [ccby40]: https://creativecommons.org/licenses/by/4.0
300 | [Swiss Data Science Center]: https://datascience.ch/collaboration-and-partnerships
301 | [EPFL]: https://www.epfl.ch
302 | [ETHZ]: https://www.ethz.ch
303 | 


--------------------------------------------------------------------------------
/baselines.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n",
  8 |     "\n",
  9 |     "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n",
 10 |     "\n",
 11 |     "## Baselines\n",
 12 |     "\n",
 13 |     "* This notebook evaluates standard classifiers from scikit-learn on the provided features.\n",
 14 |     "* Moreover, it evaluates Deep Learning models on both audio and spectrograms."
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": null,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import time\n",
 24 |     "import os\n",
 25 |     "\n",
 26 |     "import IPython.display as ipd\n",
 27 |     "from tqdm import tqdm_notebook\n",
 28 |     "import numpy as np\n",
 29 |     "import pandas as pd\n",
 30 |     "import keras\n",
 31 |     "from keras.layers import Activation, Dense, Conv1D, Conv2D, MaxPooling1D, Flatten, Reshape\n",
 32 |     "\n",
 33 |     "from sklearn.utils import shuffle\n",
 34 |     "from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler\n",
 35 |     "from sklearn.linear_model import LogisticRegression\n",
 36 |     "from sklearn.neighbors import KNeighborsClassifier\n",
 37 |     "from sklearn.svm import SVC, LinearSVC\n",
 38 |     "#from sklearn.gaussian_process import GaussianProcessClassifier\n",
 39 |     "#from sklearn.gaussian_process.kernels import RBF\n",
 40 |     "from sklearn.tree import DecisionTreeClassifier\n",
 41 |     "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n",
 42 |     "from sklearn.neural_network import MLPClassifier\n",
 43 |     "from sklearn.naive_bayes import GaussianNB\n",
 44 |     "from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis\n",
 45 |     "from sklearn.multiclass import OneVsRestClassifier\n",
 46 |     "\n",
 47 |     "import utils"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": null,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "AUDIO_DIR = os.environ.get('AUDIO_DIR')\n",
 57 |     "\n",
 58 |     "tracks = utils.load('data/fma_metadata/tracks.csv')\n",
 59 |     "features = utils.load('data/fma_metadata/features.csv')\n",
 60 |     "echonest = utils.load('data/fma_metadata/echonest.csv')\n",
 61 |     "\n",
 62 |     "np.testing.assert_array_equal(features.index, tracks.index)\n",
 63 |     "assert echonest.index.isin(tracks.index).all()\n",
 64 |     "\n",
 65 |     "tracks.shape, features.shape, echonest.shape"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "## Subset"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "subset = tracks.index[tracks['set', 'subset'] <= 'medium']\n",
 82 |     "\n",
 83 |     "assert subset.isin(tracks.index).all()\n",
 84 |     "assert subset.isin(features.index).all()\n",
 85 |     "\n",
 86 |     "features_all = features.join(echonest, how='inner').sort_index(axis=1)\n",
 87 |     "print('Not enough Echonest features: {}'.format(features_all.shape))\n",
 88 |     "\n",
 89 |     "tracks = tracks.loc[subset]\n",
 90 |     "features_all = features.loc[subset]\n",
 91 |     "\n",
 92 |     "tracks.shape, features_all.shape"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "train = tracks.index[tracks['set', 'split'] == 'training']\n",
102 |     "val = tracks.index[tracks['set', 'split'] == 'validation']\n",
103 |     "test = tracks.index[tracks['set', 'split'] == 'test']\n",
104 |     "\n",
105 |     "print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))\n",
106 |     "\n",
107 |     "genres = list(LabelEncoder().fit(tracks['track', 'genre_top']).classes_)\n",
108 |     "#genres = list(tracks['track', 'genre_top'].unique())\n",
109 |     "print('Top genres ({}): {}'.format(len(genres), genres))\n",
110 |     "genres = list(MultiLabelBinarizer().fit(tracks['track', 'genres_all']).classes_)\n",
111 |     "print('All genres ({}): {}'.format(len(genres), genres))"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "## 1 Multiple classifiers and feature sets\n",
119 |     "\n",
120 |     "Todo:\n",
121 |     "* Cross-validation for hyper-parameters.\n",
122 |     "* Dimensionality reduction?"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "metadata": {},
128 |    "source": [
129 |     "### 1.1 Pre-processing"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "def pre_process(tracks, features, columns, multi_label=False, verbose=False):\n",
139 |     "    if not multi_label:\n",
140 |     "        # Assign an integer value to each genre.\n",
141 |     "        enc = LabelEncoder()\n",
142 |     "        labels = tracks['track', 'genre_top']\n",
143 |     "        #y = enc.fit_transform(tracks['track', 'genre_top'])\n",
144 |     "    else:\n",
145 |     "        # Create an indicator matrix.\n",
146 |     "        enc = MultiLabelBinarizer()\n",
147 |     "        labels = tracks['track', 'genres_all']\n",
148 |     "        #labels = tracks['track', 'genres']\n",
149 |     "\n",
150 |     "    # Split in training, validation and testing sets.\n",
151 |     "    y_train = enc.fit_transform(labels[train])\n",
152 |     "    y_val = enc.transform(labels[val])\n",
153 |     "    y_test = enc.transform(labels[test])\n",
154 |     "    X_train = features.loc[train, columns].as_matrix()\n",
155 |     "    X_val = features.loc[val, columns].as_matrix()\n",
156 |     "    X_test = features.loc[test, columns].as_matrix()\n",
157 |     "    \n",
158 |     "    X_train, y_train = shuffle(X_train, y_train, random_state=42)\n",
159 |     "    \n",
160 |     "    # Standardize features by removing the mean and scaling to unit variance.\n",
161 |     "    scaler = StandardScaler(copy=False)\n",
162 |     "    scaler.fit_transform(X_train)\n",
163 |     "    scaler.transform(X_val)\n",
164 |     "    scaler.transform(X_test)\n",
165 |     "    \n",
166 |     "    return y_train, y_val, y_test, X_train, X_val, X_test"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "metadata": {},
172 |    "source": [
173 |     "### 1.2 Single genre"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": null,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "def test_classifiers_features(classifiers, feature_sets, multi_label=False):\n",
183 |     "    columns = list(classifiers.keys()).insert(0, 'dim')\n",
184 |     "    scores = pd.DataFrame(columns=columns, index=feature_sets.keys())\n",
185 |     "    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())\n",
186 |     "    for fset_name, fset in tqdm_notebook(feature_sets.items(), desc='features'):\n",
187 |     "        y_train, y_val, y_test, X_train, X_val, X_test = pre_process(tracks, features_all, fset, multi_label)\n",
188 |     "        scores.loc[fset_name, 'dim'] = X_train.shape[1]\n",
189 |     "        for clf_name, clf in classifiers.items():  # tqdm_notebook(classifiers.items(), desc='classifiers', leave=False):\n",
190 |     "            t = time.process_time()\n",
191 |     "            clf.fit(X_train, y_train)\n",
192 |     "            score = clf.score(X_test, y_test)\n",
193 |     "            scores.loc[fset_name, clf_name] = score\n",
194 |     "            times.loc[fset_name, clf_name] = time.process_time() - t\n",
195 |     "    return scores, times\n",
196 |     "\n",
197 |     "def format_scores(scores):\n",
198 |     "    def highlight(s):\n",
199 |     "        is_max = s == max(s[1:])\n",
200 |     "        return ['background-color: yellow' if v else '' for v in is_max]\n",
201 |     "    scores = scores.style.apply(highlight, axis=1)\n",
202 |     "    return scores.format('{:.2%}', subset=pd.IndexSlice[:, scores.columns[1]:])"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": [
211 |     "classifiers = {\n",
212 |     "    'LR': LogisticRegression(),\n",
213 |     "    'kNN': KNeighborsClassifier(n_neighbors=200),\n",
214 |     "    'SVCrbf': SVC(kernel='rbf'),\n",
215 |     "    'SVCpoly1': SVC(kernel='poly', degree=1),\n",
216 |     "    'linSVC1': SVC(kernel=\"linear\"),\n",
217 |     "    'linSVC2': LinearSVC(),\n",
218 |     "    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),\n",
219 |     "    'DT': DecisionTreeClassifier(max_depth=5),\n",
220 |     "    'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),\n",
221 |     "    'AdaBoost': AdaBoostClassifier(n_estimators=10),\n",
222 |     "    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),\n",
223 |     "    'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),\n",
224 |     "    'NB': GaussianNB(),\n",
225 |     "    'QDA': QuadraticDiscriminantAnalysis(),\n",
226 |     "}\n",
227 |     "\n",
228 |     "feature_sets = {\n",
229 |     "#    'echonest_audio': ('echonest', 'audio_features'),\n",
230 |     "#    'echonest_social': ('echonest', 'social_features'),\n",
231 |     "#    'echonest_temporal': ('echonest', 'temporal_features'),\n",
232 |     "#    'echonest_audio/social': ('echonest', ('audio_features', 'social_features')),\n",
233 |     "#    'echonest_all': ('echonest', ('audio_features', 'social_features', 'temporal_features')),\n",
234 |     "}\n",
235 |     "for name in features.columns.levels[0]:\n",
236 |     "    feature_sets[name] = name\n",
237 |     "feature_sets.update({\n",
238 |     "    'mfcc/contrast': ['mfcc', 'spectral_contrast'],\n",
239 |     "    'mfcc/contrast/chroma': ['mfcc', 'spectral_contrast', 'chroma_cens'],\n",
240 |     "    'mfcc/contrast/centroid': ['mfcc', 'spectral_contrast', 'spectral_centroid'],\n",
241 |     "    'mfcc/contrast/chroma/centroid': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid'],\n",
242 |     "    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],\n",
243 |     "    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],\n",
244 |     "    'all_non-echonest': list(features.columns.levels[0])\n",
245 |     "})\n",
246 |     "\n",
247 |     "scores, times = test_classifiers_features(classifiers, feature_sets)\n",
248 |     "\n",
249 |     "ipd.display(format_scores(scores))\n",
250 |     "ipd.display(times.style.format('{:.4f}'))"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "markdown",
255 |    "metadata": {},
256 |    "source": [
257 |     "### 1.3 Multiple genres\n",
258 |     "\n",
259 |     "Todo:\n",
260 |     "* Ignore rare genres? Count them higher up in the genre tree? On the other hand it's not much tracks."
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": null,
266 |    "metadata": {},
267 |    "outputs": [],
268 |    "source": [
269 |     "classifiers = {\n",
270 |     "    #LogisticRegression(),\n",
271 |     "    'LR': OneVsRestClassifier(LogisticRegression()),\n",
272 |     "    'SVC': OneVsRestClassifier(SVC()),\n",
273 |     "    'MLP': MLPClassifier(max_iter=700),\n",
274 |     "}\n",
275 |     "\n",
276 |     "feature_sets = {\n",
277 |     "#    'echonest_audio': ('echonest', 'audio_features'),\n",
278 |     "#    'echonest_temporal': ('echonest', 'temporal_features'),\n",
279 |     "    'mfcc': 'mfcc',\n",
280 |     "    'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],\n",
281 |     "    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr'],\n",
282 |     "}\n",
283 |     "\n",
284 |     "scores, times = test_classifiers_features(classifiers, feature_sets, multi_label=True)\n",
285 |     "\n",
286 |     "ipd.display(format_scores(scores))\n",
287 |     "ipd.display(times.style.format('{:.4f}'))"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "markdown",
292 |    "metadata": {},
293 |    "source": [
294 |     "## 2 Deep learning on raw audio\n",
295 |     "\n",
296 |     "Other architectures:\n",
297 |     "* [Learning Features of Music from Scratch (MusicNet)](https://arxiv.org/abs/1611.09827), John Thickstun, Zaid Harchaoui, Sham Kakade."
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": null,
303 |    "metadata": {},
304 |    "outputs": [],
305 |    "source": [
306 |     "labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])\n",
307 |     "labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "markdown",
312 |    "metadata": {},
313 |    "source": [
314 |     "Load audio samples in parallel using `multiprocessing` so as to maximize CPU usage when decoding MP3s and making some optional pre-processing. There are multiple ways to load a waveform from a compressed MP3:\n",
315 |     "* librosa uses audioread in the backend which can use many native libraries, e.g. ffmpeg\n",
316 |     "    * resampling is very slow --> use `kaiser_fast`\n",
317 |     "    * does not work with multi-processing, for keras `fit_generator()`\n",
318 |     "* pydub is a high-level interface for audio modification, uses ffmpeg to load\n",
319 |     "    * store a temporary `.wav`\n",
320 |     "* directly pipe ffmpeg output\n",
321 |     "    * fastest method\n",
322 |     "* [pyAV](https://github.com/mikeboers/PyAV) may be a fastest alternative by linking to ffmpeg libraries"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {},
329 |    "outputs": [],
330 |    "source": [
331 |     "# Just be sure that everything is fine. Multiprocessing is tricky to debug.\n",
332 |     "utils.FfmpegLoader().load(utils.get_audio_path(AUDIO_DIR, 2))\n",
333 |     "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, utils.FfmpegLoader())\n",
334 |     "SampleLoader(train, batch_size=2).__next__()[0].shape"
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "code",
339 |    "execution_count": null,
340 |    "metadata": {},
341 |    "outputs": [],
342 |    "source": [
343 |     "# Keras parameters.\n",
344 |     "NB_WORKER = len(os.sched_getaffinity(0))  # number of usables CPUs\n",
345 |     "params = {'pickle_safe': True, 'nb_worker': NB_WORKER, 'max_q_size': 10}"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "markdown",
350 |    "metadata": {},
351 |    "source": [
352 |     "### 2.1 Fully connected neural network\n",
353 |     "\n",
354 |     "* Two layers with 10 hiddens is no better than random, ~11%.\n",
355 |     "\n",
356 |     "Optimize data loading to be CPU / GPU bound, not IO bound. Larger batches means reduced training time, so increase batch time until memory exhaustion. Number of workers and queue size have no influence on speed."
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": null,
362 |    "metadata": {},
363 |    "outputs": [],
364 |    "source": [
365 |     "loader = utils.FfmpegLoader(sampling_rate=2000)\n",
366 |     "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)\n",
367 |     "print('Dimensionality: {}'.format(loader.shape))\n",
368 |     "\n",
369 |     "keras.backend.clear_session()\n",
370 |     "\n",
371 |     "model = keras.models.Sequential()\n",
372 |     "model.add(Dense(output_dim=1000, input_shape=loader.shape))\n",
373 |     "model.add(Activation(\"relu\"))\n",
374 |     "model.add(Dense(output_dim=100))\n",
375 |     "model.add(Activation(\"relu\"))\n",
376 |     "model.add(Dense(output_dim=labels_onehot.shape[1]))\n",
377 |     "model.add(Activation(\"softmax\"))\n",
378 |     "\n",
379 |     "optimizer = keras.optimizers.SGD(lr=0.1, momentum=0.9, nesterov=True)\n",
380 |     "model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])\n",
381 |     "\n",
382 |     "model.fit_generator(SampleLoader(train, batch_size=64), train.size, nb_epoch=2, **params)\n",
383 |     "loss = model.evaluate_generator(SampleLoader(val, batch_size=64), val.size, **params)\n",
384 |     "loss = model.evaluate_generator(SampleLoader(test, batch_size=64), test.size, **params)\n",
385 |     "#Y = model.predict_generator(SampleLoader(test, batch_size=64), test.size, **params);\n",
386 |     "\n",
387 |     "loss"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "markdown",
392 |    "metadata": {},
393 |    "source": [
394 |     "### 2.2 Convolutional neural network\n",
395 |     "\n",
396 |     "* Architecture: [End-to-end learning for music audio](http://www.mirlab.org/conference_papers/International_Conference/ICASSP%202014/papers/p7014-dieleman.pdf), Sander Dieleman, Benjamin Schrauwen.\n",
397 |     "* Missing: track segmentation and class averaging (majority voting)\n",
398 |     "* Compared with log-scaled mel-spectrograms instead of strided convolution as first layer.\n",
399 |     "* Larger net: http://benanne.github.io/2014/08/05/spotify-cnns.html"
400 |    ]
401 |   },
402 |   {
403 |    "cell_type": "code",
404 |    "execution_count": null,
405 |    "metadata": {},
406 |    "outputs": [],
407 |    "source": [
408 |     "loader = utils.FfmpegLoader(sampling_rate=16000)\n",
409 |     "#loader = utils.LibrosaLoader(sampling_rate=16000)\n",
410 |     "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)\n",
411 |     "\n",
412 |     "keras.backend.clear_session()\n",
413 |     "\n",
414 |     "model = keras.models.Sequential()\n",
415 |     "model.add(Reshape((-1, 1), input_shape=loader.shape))\n",
416 |     "print(model.output_shape)\n",
417 |     "\n",
418 |     "model.add(Conv1D(128, 512, subsample_length=512))\n",
419 |     "print(model.output_shape)\n",
420 |     "model.add(Activation(\"relu\"))\n",
421 |     "\n",
422 |     "model.add(Conv1D(32, 8))\n",
423 |     "print(model.output_shape)\n",
424 |     "model.add(Activation(\"relu\"))\n",
425 |     "model.add(MaxPooling1D(4))\n",
426 |     "\n",
427 |     "model.add(Conv1D(32, 8))\n",
428 |     "print(model.output_shape)\n",
429 |     "model.add(Activation(\"relu\"))\n",
430 |     "model.add(MaxPooling1D(4))\n",
431 |     "\n",
432 |     "print(model.output_shape)\n",
433 |     "#model.add(Dropout(0.25))\n",
434 |     "model.add(Flatten())\n",
435 |     "print(model.output_shape)\n",
436 |     "model.add(Dense(100))\n",
437 |     "model.add(Activation(\"relu\"))\n",
438 |     "print(model.output_shape)\n",
439 |     "model.add(Dense(labels_onehot.shape[1]))\n",
440 |     "model.add(Activation(\"softmax\"))\n",
441 |     "print(model.output_shape)\n",
442 |     "\n",
443 |     "optimizer = keras.optimizers.SGD(lr=0.01, momentum=0.9, nesterov=True)\n",
444 |     "#optimizer = keras.optimizers.Adam()#lr=1e-5)#, momentum=0.9, nesterov=True)\n",
445 |     "model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])\n",
446 |     "\n",
447 |     "model.fit_generator(SampleLoader(train, batch_size=10), train.size, nb_epoch=20, **params)\n",
448 |     "loss = model.evaluate_generator(SampleLoader(val, batch_size=10), val.size, **params)\n",
449 |     "loss = model.evaluate_generator(SampleLoader(test, batch_size=10), test.size, **params)\n",
450 |     "\n",
451 |     "loss"
452 |    ]
453 |   },
454 |   {
455 |    "cell_type": "markdown",
456 |    "metadata": {},
457 |    "source": [
458 |     "### 2.3 Recurrent neural network"
459 |    ]
460 |   },
461 |   {
462 |    "cell_type": "markdown",
463 |    "metadata": {},
464 |    "source": [
465 |     "## 3 Deep learning on extracted audio features\n",
466 |     "\n",
467 |     "Look at:\n",
468 |     "* Pre-processing in Keras: https://github.com/keunwoochoi/kapre\n",
469 |     "* Convolutional Recurrent Neural Networks for Music Classification: https://github.com/keunwoochoi/icassp_2017\n",
470 |     "* Music Auto-Tagger: https://github.com/keunwoochoi/music-auto_tagging-keras\n",
471 |     "* Pre-processor: https://github.com/bmcfee/pumpp"
472 |    ]
473 |   },
474 |   {
475 |    "cell_type": "markdown",
476 |    "metadata": {},
477 |    "source": [
478 |     "### 3.1 ConvNet on MFCC\n",
479 |     "\n",
480 |     "* Architecture: [Automatic Musical Pattern Feature Extraction Using Convolutional Neural Network](http://www.iaeng.org/publication/IMECS2010/IMECS2010_pp546-550.pdf), Tom LH. Li, Antoni B. Chan and Andy HW. Chun\n",
481 |     "* Missing: track segmentation and majority voting.\n",
482 |     "* Best seen: 17.6%"
483 |    ]
484 |   },
485 |   {
486 |    "cell_type": "code",
487 |    "execution_count": null,
488 |    "metadata": {},
489 |    "outputs": [],
490 |    "source": [
491 |     "class MfccLoader(utils.Loader):\n",
492 |     "    raw_loader = utils.FfmpegLoader(sampling_rate=22050)\n",
493 |     "    #shape = (13, 190)  # For segmented tracks.\n",
494 |     "    shape = (13, 2582)\n",
495 |     "    def load(self, filename):\n",
496 |     "        import librosa\n",
497 |     "        x = self.raw_loader.load(filename)\n",
498 |     "        # Each MFCC frame spans 23ms on the audio signal with 50% overlap with the adjacent frames.\n",
499 |     "        mfcc = librosa.feature.mfcc(x, sr=22050, n_mfcc=13, n_fft=512, hop_length=256)\n",
500 |     "        return mfcc\n",
501 |     "\n",
502 |     "loader = MfccLoader()\n",
503 |     "SampleLoader = utils.build_sample_loader(AUDIO_DIR, labels_onehot, loader)\n",
504 |     "loader.load(utils.get_audio_path(AUDIO_DIR, 2))[0].shape"
505 |    ]
506 |   },
507 |   {
508 |    "cell_type": "code",
509 |    "execution_count": null,
510 |    "metadata": {},
511 |    "outputs": [],
512 |    "source": [
513 |     "keras.backend.clear_session()\n",
514 |     "\n",
515 |     "model = keras.models.Sequential()\n",
516 |     "model.add(Reshape((*loader.shape, 1),  input_shape=loader.shape))\n",
517 |     "print(model.output_shape)\n",
518 |     "\n",
519 |     "model.add(Conv2D(3, 13, 10, subsample=(1, 4)))\n",
520 |     "model.add(Activation(\"relu\"))\n",
521 |     "print(model.output_shape)\n",
522 |     "\n",
523 |     "model.add(Conv2D(15, 1, 10, subsample=(1, 4)))\n",
524 |     "model.add(Activation(\"relu\"))\n",
525 |     "print(model.output_shape)\n",
526 |     "\n",
527 |     "model.add(Conv2D(65, 1, 10, subsample=(1, 4)))\n",
528 |     "model.add(Activation(\"relu\"))\n",
529 |     "print(model.output_shape)\n",
530 |     "\n",
531 |     "model.add(Flatten())\n",
532 |     "print(model.output_shape)\n",
533 |     "model.add(Dense(labels_onehot.shape[1]))\n",
534 |     "model.add(Activation(\"softmax\"))\n",
535 |     "print(model.output_shape)\n",
536 |     "\n",
537 |     "optimizer = keras.optimizers.SGD(1e-3)#lr=0.01, momentum=0.9, nesterov=True)\n",
538 |     "#optimizer = keras.optimizers.Adam()#lr=1e-5)#\n",
539 |     "model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])\n",
540 |     "\n",
541 |     "model.fit_generator(SampleLoader(train, batch_size=16), train.size, nb_epoch=20, **params)\n",
542 |     "loss = model.evaluate_generator(SampleLoader(val, batch_size=16), val.size, **params)\n",
543 |     "loss = model.evaluate_generator(SampleLoader(test, batch_size=16), test.size, **params)\n",
544 |     "#Y = model.predict_generator(loader, test.size, pickle_safe=True, nb_worker=NB_WORKER, max_q_size=5)\n",
545 |     "\n",
546 |     "loss"
547 |    ]
548 |   }
549 |  ],
550 |  "metadata": {},
551 |  "nbformat": 4,
552 |  "nbformat_minor": 1
553 | }
554 | 


--------------------------------------------------------------------------------
/analysis.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n",
  8 |     "\n",
  9 |     "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n",
 10 |     "\n",
 11 |     "## Analysis\n",
 12 |     "\n",
 13 |     "All numbers and figures which appear in the [paper] and much more.\n",
 14 |     "\n",
 15 |     "[paper]: https://arxiv.org/abs/1612.01840"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "%matplotlib inline\n",
 25 |     "\n",
 26 |     "import IPython.display as ipd\n",
 27 |     "import numpy as np\n",
 28 |     "import pandas as pd\n",
 29 |     "import matplotlib.pyplot as plt\n",
 30 |     "import seaborn as sns\n",
 31 |     "from sklearn.preprocessing import MultiLabelBinarizer\n",
 32 |     "\n",
 33 |     "import utils\n",
 34 |     "\n",
 35 |     "sns.set_context(\"notebook\", font_scale=1.5)\n",
 36 |     "plt.rcParams['figure.figsize'] = (17, 5)"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "tracks = utils.load('data/fma_metadata/tracks.csv')\n",
 46 |     "genres = utils.load('data/fma_metadata/genres.csv')\n",
 47 |     "features = utils.load('data/fma_metadata/features.csv')\n",
 48 |     "echonest = utils.load('data/fma_metadata/echonest.csv')\n",
 49 |     "\n",
 50 |     "np.testing.assert_array_equal(features.index, tracks.index)\n",
 51 |     "assert echonest.index.isin(tracks.index).all()\n",
 52 |     "\n",
 53 |     "tracks.shape, genres.shape, features.shape, echonest.shape"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "markdown",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "## 1 Size\n",
 61 |     "\n",
 62 |     "Todo:\n",
 63 |     "* When are tracks mostly added.\n",
 64 |     "* Which tracks got deleted."
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "print('{} tracks, {} artists, {} albums, {} genres'.format(\n",
 74 |     "    len(tracks), len(tracks['artist', 'id'].unique()),\n",
 75 |     "    len(tracks['album', 'id'].unique()),\n",
 76 |     "    sum(genres['#tracks'] > 0)))\n",
 77 |     "mean_duration = tracks['track', 'duration'].mean()\n",
 78 |     "print('track duration: {:.0f} days total, {:.0f} seconds average'.format(\n",
 79 |     "    sum(tracks['track', 'duration']) / 3600 / 24,\n",
 80 |     "    mean_duration))"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "dimensionality = mean_duration * 44000 * 2\n",
 90 |     "print('sample dimensionality: {:.1e}'.format(dimensionality))\n",
 91 |     "print('total size, i.e. number of audio samples: {:.1e}'.format(dimensionality * len(tracks)))"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "for subset in tracks['set', 'subset'].unique():\n",
101 |     "    indicator = tracks['set', 'subset'] <= subset\n",
102 |     "    print('{:6} {:6} tracks  {:.1f} days'.format(\n",
103 |     "        subset, sum(indicator), sum(indicator) * 30 / 3600 / 24))"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": [
112 |     "print('{} deleted tracks (largest track_id is {})'.format(tracks.index.max() - len(tracks), tracks.index.max()))\n",
113 |     "print('First track: {}'.format(tracks['track', 'date_created'].min()))\n",
114 |     "\n",
115 |     "d = pd.DataFrame(tracks.index, index=tracks['track', 'date_created'].values)\n",
116 |     "d['indicator'] = 1\n",
117 |     "\n",
118 |     "fig, ax1 = plt.subplots()\n",
119 |     "ax2 = ax1.twinx()\n",
120 |     "\n",
121 |     "d['track_id'].plot(ax=ax1)\n",
122 |     "d['indicator'].cumsum().plot(ax=ax1)\n",
123 |     "ax1.set_ylabel('#tracks')\n",
124 |     "ax1.set_ylim(0, 160000)\n",
125 |     "\n",
126 |     "(d['indicator'] * -100).plot(ax=ax2, style='r')  # needed for no apparent reason\n",
127 |     "color = sns.color_palette('deep', 3)[2]\n",
128 |     "d['indicator'].resample('2M').sum().fillna(0).plot(ax=ax2, style='--', color=color)\n",
129 |     "ax2.set_ylabel('#tracks added')\n",
130 |     "ax2.set_ylim(500, 4500)\n",
131 |     "ax2.set_ylim(0, 4000)\n",
132 |     "ax2.grid(False)\n",
133 |     "\n",
134 |     "lns = ax1.get_lines() + [ax2.get_lines()[1]]\n",
135 |     "ax1.legend(lns, ['largest track id', '#tracks still present', '#tracks added per 2 months'], loc='lower right')\n",
136 |     "\n",
137 |     "plt.savefig('figures/growth.pdf')"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "### 1.1 Splits"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "SPLITS = ['training', 'validation', 'test']\n",
154 |     "SUBSETS = ['small', 'medium', 'large']\n",
155 |     "print('subset    #train    #val   #test  val_ratio test_ratio')\n",
156 |     "for subset in SUBSETS:\n",
157 |     "    counts = [sum((tracks['set', 'split'] == split) & (tracks['set', 'subset'] <= subset)) for split in SPLITS]\n",
158 |     "    ratios = np.array(counts[0] / counts[1:])\n",
159 |     "    print('{:8s} {:7d} {:7d} {:7d} {:8.2f} {:9.2f}'.format(subset, *counts, *ratios))"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "metadata": {},
166 |    "outputs": [],
167 |    "source": [
168 |     "for subset in ['small', 'medium']:\n",
169 |     "    subset = tracks['set', 'subset'] <= subset\n",
170 |     "\n",
171 |     "    d = genres.reset_index().set_index('title')\n",
172 |     "    d = d.loc[tracks.loc[subset, ('track', 'genre_top')].unique()]\n",
173 |     "\n",
174 |     "    for split in SPLITS:\n",
175 |     "        b = tracks['set', 'split'] == split\n",
176 |     "        d['#' + split] = tracks.loc[subset & b, ('track', 'genre_top')].value_counts()\n",
177 |     "\n",
178 |     "    d['val_ratio'] = d['#training'] / d['#validation']\n",
179 |     "    d['test_ratio'] = d['#training'] / d['#test']\n",
180 |     "\n",
181 |     "    ipd.display(d.sort_values('#training', ascending=False))"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "d = pd.DataFrame(index=genres.index, columns=SPLITS)\n",
191 |     "for genre in genres.index:\n",
192 |     "    b = tracks['track', 'genres_all'].map(lambda genres: genre in genres)\n",
193 |     "    d.loc[genre] = tracks.loc[b, ('set', 'split')].value_counts()\n",
194 |     "d['val_ratio'] = d['training'] / d['validation']\n",
195 |     "d['test_ratio'] = d['training'] / d['test']\n",
196 |     "d.sort_values('training', ascending=False, inplace=True)\n",
197 |     "ipd.display(d.head(10))\n",
198 |     "ipd.display(d.tail(10))"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {},
204 |    "source": [
205 |     "## 2 Metadata"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": null,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "def isnull(column, df=tracks):\n",
215 |     "    if column[1] in ['tags', 'genres', 'genres_all']:\n",
216 |     "        return df[column].apply(lambda x: len(x) == 0)\n",
217 |     "    elif df.dtypes[column] == np.int:\n",
218 |     "        return df[column] <= 0\n",
219 |     "    else:\n",
220 |     "        return df[column].isnull()\n",
221 |     "\n",
222 |     "def count(series):\n",
223 |     "    col0 = series.name[0]\n",
224 |     "    df = tracks if col0 == 'track' else tracks.drop_duplicates((col0, 'id'))\n",
225 |     "    n = (~isnull(series.name, df)).sum()\n",
226 |     "    p = n / len(df) * 100\n",
227 |     "    return n, p\n",
228 |     "\n",
229 |     "# Columns / metadata usage across dataset.\n",
230 |     "d = pd.DataFrame(index=tracks.columns.drop('set'), columns=['n', 'p'])\n",
231 |     "d = d.apply(count, axis=1)\n",
232 |     "d['n'] = d['n'].astype(np.int)\n",
233 |     "d"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": null,
239 |    "metadata": {},
240 |    "outputs": [],
241 |    "source": [
242 |     "# Excerpt as example in the paper.\n",
243 |     "columns = [\n",
244 |     "    ('track', 'title'),\n",
245 |     "    ('track', 'genres_all'),\n",
246 |     "    ('track', 'genre_top'),\n",
247 |     "    ('track', 'duration'),\n",
248 |     "    ('track', 'listens'),\n",
249 |     "    ('album', 'title'),\n",
250 |     "    ('album', 'listens'),\n",
251 |     "    ('album', 'tags'),\n",
252 |     "    ('artist', 'name'),\n",
253 |     "    ('artist', 'location'),\n",
254 |     "]\n",
255 |     "\n",
256 |     "non_null = ~isnull(columns[0])\n",
257 |     "for column in columns[1:]:\n",
258 |     "    non_null &= ~isnull(column)\n",
259 |     "tids = np.random.RandomState(42).permutation(tracks.index[non_null])[:8]\n",
260 |     "\n",
261 |     "tracks.loc[tids, columns].head()\n",
262 |     "\n",
263 |     "#tracks.loc[tids, columns].to_latex('figures/tracks.tex', formatters={\n",
264 |     "#    ('artist', 'longitude'): '{:,.1f}'.format,\n",
265 |     "#    ('artist', 'latitude'): '{:,.1f}'.format,\n",
266 |     "#})"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "tracks['track', 'license'].value_counts().head(10)"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": null,
281 |    "metadata": {},
282 |    "outputs": [],
283 |    "source": [
284 |     "tracks['track', 'language_code'].value_counts().head(10)"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "markdown",
289 |    "metadata": {},
290 |    "source": [
291 |     "### 2.1 Technical data"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": null,
297 |    "metadata": {},
298 |    "outputs": [],
299 |    "source": [
300 |     "durations = tracks['track', 'duration']\n",
301 |     "plt.figure(figsize=(10, 4))  # Poster: (7, 3)\n",
302 |     "p = sns.distplot(durations[durations.values < 800], kde=False, rug=False, color='k', hist_kws=dict(alpha=0.4))\n",
303 |     "p.set_xlabel('duration [seconds]')\n",
304 |     "p.set_ylabel('#tracks')\n",
305 |     "p.set_xlim(0, 800)  # Poster: 500\n",
306 |     "plt.tight_layout()\n",
307 |     "plt.savefig('figures/duration_distribution.pdf')\n",
308 |     "\n",
309 |     "durations.describe()"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "metadata": {},
316 |    "outputs": [],
317 |    "source": [
318 |     "# Uncommon bit rates are VBR encodings.\n",
319 |     "print('Common bit rates: {}'.format(tracks['track', 'bit_rate'].value_counts().head(5).index.tolist()))\n",
320 |     "print('Average bit rate: {:.0f} kbit/s'.format(tracks['track', 'bit_rate'].mean()/1000))\n",
321 |     "p = sns.distplot(tracks['track', 'bit_rate'], kde=False, rug=False)\n",
322 |     "p.set_xlabel('bit rate')\n",
323 |     "p.set_ylabel('#tracks');"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "markdown",
328 |    "metadata": {},
329 |    "source": [
330 |     "### 2.2 User data"
331 |    ]
332 |   },
333 |   {
334 |    "cell_type": "code",
335 |    "execution_count": null,
336 |    "metadata": {},
337 |    "outputs": [],
338 |    "source": [
339 |     "# Tags.\n",
340 |     "d1 = tracks['track', 'tags'].apply(len)\n",
341 |     "d2 = tracks.drop_duplicates(('album', 'id'))\n",
342 |     "d2 = d2['album', 'tags'].apply(len)\n",
343 |     "d3 = tracks.drop_duplicates(('artist', 'id'))\n",
344 |     "d3 = d3['artist', 'tags'].apply(len) - 1\n",
345 |     "\n",
346 |     "labels = ['track', 'album', 'artist']\n",
347 |     "for l, d in zip(labels, [d1, d2, d3]):\n",
348 |     "    print('{}: from {} to {} tags'.format(l, max(d.min(), 0), d.max()))\n",
349 |     "\n",
350 |     "MAX = 13  # Poster: 11\n",
351 |     "fig, ax1 = plt.subplots(figsize=(10, 4))  # Poster: (7, 3)\n",
352 |     "ax2 = ax1.twinx()\n",
353 |     "\n",
354 |     "ax1.hist(d1, bins=np.arange(MAX)+0.25, rwidth=0.2, color='C0', label=labels[0])\n",
355 |     "ax2.hist(d2, bins=np.arange(MAX)+0.50, rwidth=0.2, color='C1', label=labels[1])\n",
356 |     "ax2.hist(d3, bins=np.arange(MAX)+0.75, rwidth=0.2, color='C2', label=labels[2])\n",
357 |     "\n",
358 |     "ax1.set_xlabel('#tags')\n",
359 |     "ax1.set_ylabel('#tracks')\n",
360 |     "ax2.set_ylabel('#artists   /   #albums')\n",
361 |     "ax1.set_xlim(0.5, MAX-0.5)\n",
362 |     "ax1.set_xticks(range(1, MAX))\n",
363 |     "ax1.set_ylim(0, 5000)\n",
364 |     "ax2.set_ylim(0, 500)\n",
365 |     "ax1.legend(loc='upper center')\n",
366 |     "ax2.legend(loc='upper right')\n",
367 |     "ax2.grid(False)\n",
368 |     "\n",
369 |     "fig.tight_layout()\n",
370 |     "fig.savefig('figures/tag_distribution.pdf')"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": null,
376 |    "metadata": {},
377 |    "outputs": [],
378 |    "source": [
379 |     "# One artist tag is often the artist name.\n",
380 |     "col = 'artist'\n",
381 |     "d = tracks.drop_duplicates((col, 'id'))\n",
382 |     "d.loc[d[col, 'tags'].apply(len) > 0, [('artist', 'name'), (col, 'tags')]].head()"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "code",
387 |    "execution_count": null,
388 |    "metadata": {},
389 |    "outputs": [],
390 |    "source": [
391 |     "# Listens, favorites, comments.\n",
392 |     "\n",
393 |     "def plot(col0, col1, maxval, subplot=None):\n",
394 |     "    if col0 == 'track':\n",
395 |     "        d = tracks['track']\n",
396 |     "    if col0 in ['artist', 'album']:\n",
397 |     "        d = tracks[col0].drop_duplicates('id')\n",
398 |     "    if subplot:\n",
399 |     "        plt.subplot(subplot)\n",
400 |     "    d = d[col1]\n",
401 |     "    p = sns.distplot(d[d.values < maxval], kde=False, color='k', hist_kws=dict(alpha=0.4))\n",
402 |     "    p.set_xlim(-1, maxval)\n",
403 |     "    p.set_xlabel('#' + col1)\n",
404 |     "    p.set_ylabel('#' + col0 + 's')\n",
405 |     "\n",
406 |     "plt.figure(figsize=(17, 10))\n",
407 |     "plot('track', 'listens', 10e3, 221)\n",
408 |     "plot('track', 'interest', 10e3, 222)\n",
409 |     "plot('track', 'favorites', 100, 223)\n",
410 |     "plot('track', 'comments', 20, 224)\n",
411 |     "\n",
412 |     "plt.figure(figsize=(17, 10))\n",
413 |     "plot('album', 'listens', 100e3, 221)\n",
414 |     "plot('album', 'favorites', 100, 223)\n",
415 |     "plot('album', 'comments', 20, 224)\n",
416 |     "\n",
417 |     "plt.figure(figsize=(17, 5))\n",
418 |     "plot('artist', 'favorites', 100, 121)\n",
419 |     "plot('artist', 'comments', 20, 122)"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": null,
425 |    "metadata": {},
426 |    "outputs": [],
427 |    "source": [
428 |     "# Same as above, formated for the paper.\n",
429 |     "plt.figure(figsize=(10, 4))  # Poster: (7, 3)\n",
430 |     "plot('album', 'listens', 40e3)  # Poster 20e3\n",
431 |     "plt.tight_layout()\n",
432 |     "plt.savefig('figures/listens_distribution.pdf')\n",
433 |     "\n",
434 |     "tracks['album', 'listens'].max()"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": null,
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "# Most listened albums.\n",
444 |     "tracks['album'].groupby('id').first().sort_values('listens', ascending=False).head(10)"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "markdown",
449 |    "metadata": {},
450 |    "source": [
451 |     "### 2.3 Dates"
452 |    ]
453 |   },
454 |   {
455 |    "cell_type": "code",
456 |    "execution_count": null,
457 |    "metadata": {},
458 |    "outputs": [],
459 |    "source": [
460 |     "def plot(col0, col1):\n",
461 |     "    if col0 == 'track':\n",
462 |     "        d = tracks['track']\n",
463 |     "    if col0 in ['artist', 'album']:\n",
464 |     "        d = tracks[col0].drop_duplicates('id')\n",
465 |     "    d = pd.Series(1, index=d[col1])\n",
466 |     "    d.resample('A').sum().fillna(0).plot()\n",
467 |     "\n",
468 |     "plt.figure()\n",
469 |     "plot('track', 'date_recorded')\n",
470 |     "plot('album', 'date_released')\n",
471 |     "\n",
472 |     "plt.figure()\n",
473 |     "plot('artist', 'active_year_begin')\n",
474 |     "plot('artist', 'active_year_end')\n",
475 |     "\n",
476 |     "plt.figure()\n",
477 |     "plot('track', 'date_created')\n",
478 |     "plot('album', 'date_created')\n",
479 |     "plot('artist', 'date_created')"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": null,
485 |    "metadata": {},
486 |    "outputs": [],
487 |    "source": [
488 |     "# Same as above, formated for the paper.\n",
489 |     "plt.figure(figsize=(5, 4))\n",
490 |     "d = tracks['album'].drop_duplicates('id')\n",
491 |     "d = pd.Series(1, index=d['date_released'])\n",
492 |     "d = d.resample('A').sum().fillna(0)\n",
493 |     "b = d.index >= pd.to_datetime(1990, format='%Y')\n",
494 |     "b &= d.index <= pd.to_datetime(2017, format='%Y')\n",
495 |     "d[b].plot(color='k')\n",
496 |     "plt.xlabel('release year')\n",
497 |     "plt.ylabel('#albums')\n",
498 |     "plt.tight_layout()\n",
499 |     "plt.savefig('figures/album_release_year.pdf')\n",
500 |     "\n",
501 |     "d.index.min().year, d.index.max().year"
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "markdown",
506 |    "metadata": {},
507 |    "source": [
508 |     "## 3 Artists & albums effect"
509 |    ]
510 |   },
511 |   {
512 |    "cell_type": "code",
513 |    "execution_count": null,
514 |    "metadata": {},
515 |    "outputs": [],
516 |    "source": [
517 |     "for effect in ['artist', 'album']:\n",
518 |     "    d = tracks[effect, 'id'].value_counts()\n",
519 |     "    ipd.display(d.head(5))\n",
520 |     "    p = sns.distplot(d[(d.values < 50) & (d.values >= 0)], kde=False)\n",
521 |     "    p.set_xlabel('#tracks per ' + effect);\n",
522 |     "    p.set_ylabel('#' + effect + 's');"
523 |    ]
524 |   },
525 |   {
526 |    "cell_type": "code",
527 |    "execution_count": null,
528 |    "metadata": {},
529 |    "outputs": [],
530 |    "source": [
531 |     "counts = pd.Series(index=genres.loc[genres['parent'] == 0, 'title'].values, name='#artists')\n",
532 |     "for genre in counts.index:\n",
533 |     "    counts[genre] = len(tracks.loc[tracks['track', 'genre_top'] == genre, ('artist', 'id')].unique())\n",
534 |     "counts.sort_values(ascending=False).plot.bar()\n",
535 |     "plt.ylabel('#artists');"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "markdown",
540 |    "metadata": {},
541 |    "source": [
542 |     "## 4 Genres"
543 |    ]
544 |   },
545 |   {
546 |    "cell_type": "code",
547 |    "execution_count": null,
548 |    "metadata": {},
549 |    "outputs": [],
550 |    "source": [
551 |     "a = set(tracks['track', 'genre_top'].unique().dropna())\n",
552 |     "b = set(genres.loc[genres['top_level'].unique(), 'title'].values)\n",
553 |     "assert a == b\n",
554 |     "\n",
555 |     "print('{} top-level genres'.format(len(a)))\n",
556 |     "genres[genres['parent'] == 0].sort_values('#tracks', ascending=False)"
557 |    ]
558 |   },
559 |   {
560 |    "cell_type": "markdown",
561 |    "metadata": {},
562 |    "source": [
563 |     "Number of genres per track:\n",
564 |     "* `genres`: they have introduced a [limit of 3 genres per track](https://twitter.com/therewasaguy/status/863426542075953152) early on.\n",
565 |     "* `genres_all`: more genres per track as all coarser genres in the hierarchy are included. E.g. an Indie-Rock song is counted as a Rock song too."
566 |    ]
567 |   },
568 |   {
569 |    "cell_type": "code",
570 |    "execution_count": null,
571 |    "metadata": {},
572 |    "outputs": [],
573 |    "source": [
574 |     "# Genres per track.\n",
575 |     "labels = ['genres', 'genres_all']  #, 'genres_top']\n",
576 |     "d = [tracks['track', label].map(len) for label in labels]\n",
577 |     "labels = ['{}\\nmax: {}'.format(label, d1.max()) for label, d1 in zip(labels, d)]\n",
578 |     "\n",
579 |     "for l, d1 in zip(labels, d):\n",
580 |     "    print('{} per track: from {} to {} tags'.format(l, d1.min(), d1.max()))\n",
581 |     "print('#tracks without genre: {}'.format((tracks['track', 'genres'].map(len) == 0).sum()))\n",
582 |     "\n",
583 |     "MAX = 9\n",
584 |     "fig, ax = plt.subplots(figsize=(5, 4))\n",
585 |     "ax.hist(d, bins=np.arange(MAX)-0.5, label=labels)\n",
586 |     "ax.set_xlabel('#genres per track')\n",
587 |     "ax.set_ylabel('#tracks')\n",
588 |     "ax.set_xlim(-0.5, MAX-1.5)\n",
589 |     "ax.set_xticks(range(MAX-1))\n",
590 |     "ax.set_yticklabels(['0'] + ['{}0k'.format(i) for i in range(1, 6)])\n",
591 |     "ax.legend(loc='upper right')\n",
592 |     "fig.tight_layout()\n",
593 |     "fig.savefig('figures/genres_per_track.pdf')"
594 |    ]
595 |   },
596 |   {
597 |    "cell_type": "code",
598 |    "execution_count": null,
599 |    "metadata": {},
600 |    "outputs": [],
601 |    "source": [
602 |     "# Number of tracks per genre (full).\n",
603 |     "d = genres[genres['#tracks'] > 2000].sort_values('#tracks', ascending=False)  # Poster: 5000\n",
604 |     "plt.figure(figsize=(10, 4))  # Poster: (7, 4)\n",
605 |     "p = sns.barplot('title', '#tracks', data=d, color='k', alpha=0.4)\n",
606 |     "p.set_xlabel('')\n",
607 |     "p.set_ylabel('#tracks')\n",
608 |     "plt.xticks(rotation=90)\n",
609 |     "plt.tight_layout()\n",
610 |     "plt.savefig('figures/genre_distribution.pdf')\n",
611 |     "\n",
612 |     "genres.loc[genres['#tracks'] > 0, '#tracks'].min(), genres['#tracks'].max()"
613 |    ]
614 |   },
615 |   {
616 |    "cell_type": "code",
617 |    "execution_count": null,
618 |    "metadata": {},
619 |    "outputs": [],
620 |    "source": [
621 |     "# Number of tracks per top-level genre (medium).\n",
622 |     "d = tracks[tracks['set', 'subset'] <= 'medium']\n",
623 |     "d = d['track', 'genre_top'].value_counts()\n",
624 |     "plt.figure(figsize=(10, 4))  # Poster: (7, 4)\n",
625 |     "d.plot.bar(color='k', alpha=0.4)\n",
626 |     "plt.ylabel('#tracks')\n",
627 |     "plt.xlabel('')\n",
628 |     "plt.tight_layout()\n",
629 |     "plt.savefig('figures/genre_top_distribution.pdf')\n",
630 |     "\n",
631 |     "d"
632 |    ]
633 |   },
634 |   {
635 |    "cell_type": "markdown",
636 |    "metadata": {},
637 |    "source": [
638 |     "### 4.1 Genre hierarchy\n",
639 |     "\n",
640 |     "* As genres have parent genres, we can plot a tree using the [DOT] language.\n",
641 |     "* Save the full genre tree as a PDF.\n",
642 |     "\n",
643 |     "Todo:\n",
644 |     "* Color nodes according to FMA genre color.\n",
645 |     "* Better looking tree.\n",
646 |     "\n",
647 |     "[DOT]: https://en.wikipedia.org/wiki/DOT_(graph_description_language)"
648 |    ]
649 |   },
650 |   {
651 |    "cell_type": "code",
652 |    "execution_count": null,
653 |    "metadata": {},
654 |    "outputs": [],
655 |    "source": [
656 |     "g = utils.Genres(genres)\n",
657 |     "graph = g.create_tree([25, 31], 1)\n",
658 |     "ipd.Image(graph.create_png())"
659 |    ]
660 |   },
661 |   {
662 |    "cell_type": "code",
663 |    "execution_count": null,
664 |    "metadata": {},
665 |    "outputs": [],
666 |    "source": [
667 |     "graph = g.create_tree(14)\n",
668 |     "graph.write_pdf('figures/genre_hierarchy.pdf');\n",
669 |     "\n",
670 |     "roots = g.find_roots()\n",
671 |     "print('{} roots'.format(len(roots)))\n",
672 |     "graph = g.create_tree(roots)\n",
673 |     "graph.write_pdf('figures/genre_hierarchy.pdf');"
674 |    ]
675 |   },
676 |   {
677 |    "cell_type": "markdown",
678 |    "metadata": {},
679 |    "source": [
680 |     "### 4.2 Cross-appearance\n",
681 |     "\n",
682 |     "Todo:\n",
683 |     "* Group rows and columns for better identification of related genres."
684 |    ]
685 |   },
686 |   {
687 |    "cell_type": "code",
688 |    "execution_count": null,
689 |    "metadata": {},
690 |    "outputs": [],
691 |    "source": [
692 |     "enc = MultiLabelBinarizer()\n",
693 |     "genres_indicator = enc.fit_transform(tracks['track', 'genres'])\n",
694 |     "genres_names = enc.classes_\n",
695 |     "genres_names = genres.loc[enc.classes_, 'title'].values\n",
696 |     "cross_correlation = genres_indicator.T @ genres_indicator"
697 |    ]
698 |   },
699 |   {
700 |    "cell_type": "code",
701 |    "execution_count": null,
702 |    "metadata": {},
703 |    "outputs": [],
704 |    "source": [
705 |     "np.fill_diagonal(cross_correlation, 0)\n",
706 |     "\n",
707 |     "plt.figure(figsize=(28, 28))\n",
708 |     "plt.imshow(np.log(cross_correlation))\n",
709 |     "plt.yticks(range(len(genres_names)), genres_names);\n",
710 |     "plt.xticks(range(len(genres_names)), genres_names, rotation=90);"
711 |    ]
712 |   },
713 |   {
714 |    "cell_type": "code",
715 |    "execution_count": null,
716 |    "metadata": {},
717 |    "outputs": [],
718 |    "source": [
719 |     "cross_correlation = np.tril(cross_correlation, k=-1)\n",
720 |     "sort = np.argsort(cross_correlation.flatten())\n",
721 |     "\n",
722 |     "N = 20\n",
723 |     "indices = np.unravel_index(sort[:-N:-1], cross_correlation.shape)\n",
724 |     "for i, j in zip(*indices):\n",
725 |     "    print('{}: {} | {}'.format(cross_correlation[i, j], genres_names[i], genres_names[j]))"
726 |    ]
727 |   },
728 |   {
729 |    "cell_type": "markdown",
730 |    "metadata": {},
731 |    "source": [
732 |     "## 5 Audio\n",
733 |     "\n",
734 |     "Todo: e.g. audio features (echonest / librosa, spectrograms) to show diversity."
735 |    ]
736 |   },
737 |   {
738 |    "cell_type": "markdown",
739 |    "metadata": {},
740 |    "source": [
741 |     "## 6 Features\n",
742 |     "\n",
743 |     "Todo: understand features by listening to segments who have them, e.g. <http://musicinformationretrieval.com/feature_sonification.html>."
744 |    ]
745 |   },
746 |   {
747 |    "cell_type": "code",
748 |    "execution_count": null,
749 |    "metadata": {},
750 |    "outputs": [],
751 |    "source": [
752 |     "features.head(5).style.format('{:.2f}')"
753 |    ]
754 |   },
755 |   {
756 |    "cell_type": "code",
757 |    "execution_count": null,
758 |    "metadata": {},
759 |    "outputs": [],
760 |    "source": [
761 |     "sns.pairplot(features.loc[:, ('mfcc', 'mean', slice('01','03'))]);\n",
762 |     "sns.pairplot(features.loc[:, ('mfcc', 'std', slice('01','03'))]);"
763 |    ]
764 |   },
765 |   {
766 |    "cell_type": "markdown",
767 |    "metadata": {},
768 |    "source": [
769 |     "## 7 Echonest features"
770 |    ]
771 |   },
772 |   {
773 |    "cell_type": "code",
774 |    "execution_count": null,
775 |    "metadata": {},
776 |    "outputs": [],
777 |    "source": [
778 |     "print('Echonest features available for {} tracks.'.format(len(echonest)))"
779 |    ]
780 |   }
781 |  ],
782 |  "metadata": {},
783 |  "nbformat": 4,
784 |  "nbformat_minor": 2
785 | }
786 | 


--------------------------------------------------------------------------------
/creation.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)\n",
   8 |     "\n",
   9 |     "Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.\n",
  10 |     "\n",
  11 |     "## Creation\n",
  12 |     "\n",
  13 |     "From `raw_*.csv`, this notebook generates:\n",
  14 |     "* `tracks.csv`: per-track / album / artist metadata.\n",
  15 |     "* `genres.csv`: genre hierarchy.\n",
  16 |     "* `echonest.csv`: cleaned Echonest features.\n",
  17 |     "\n",
  18 |     "A companion script, [creation.py](creation.py):\n",
  19 |     "1. Query the [API](https://freemusicarchive.org/api) and store metadata in `raw_tracks.csv`, `raw_albums.csv`, `raw_artists.csv` and `raw_genres.csv`.\n",
  20 |     "2. Download the audio for each track.\n",
  21 |     "3. Trim the audio to 30s clips.\n",
  22 |     "4. Normalize the permissions and modification / access times.\n",
  23 |     "5. Create the `.zip` archives."
  24 |    ]
  25 |   },
  26 |   {
  27 |    "cell_type": "code",
  28 |    "execution_count": null,
  29 |    "metadata": {},
  30 |    "outputs": [],
  31 |    "source": [
  32 |     "import os\n",
  33 |     "import ast\n",
  34 |     "import pickle\n",
  35 |     "\n",
  36 |     "import IPython.display as ipd\n",
  37 |     "import numpy as np\n",
  38 |     "import pandas as pd\n",
  39 |     "\n",
  40 |     "import utils\n",
  41 |     "import creation"
  42 |    ]
  43 |   },
  44 |   {
  45 |    "cell_type": "code",
  46 |    "execution_count": null,
  47 |    "metadata": {},
  48 |    "outputs": [],
  49 |    "source": [
  50 |     "AUDIO_DIR = os.environ.get('AUDIO_DIR')\n",
  51 |     "BASE_DIR = os.path.abspath(os.path.dirname(AUDIO_DIR))\n",
  52 |     "FMA_FULL = os.path.join(BASE_DIR, 'fma_full')\n",
  53 |     "FMA_LARGE = os.path.join(BASE_DIR, 'fma_large')"
  54 |    ]
  55 |   },
  56 |   {
  57 |    "cell_type": "markdown",
  58 |    "metadata": {},
  59 |    "source": [
  60 |     "## 1 Retrieve metadata and audio from FMA\n",
  61 |     "\n",
  62 |     "1. Crawl the tracks, albums and artists metadata through their [API](https://freemusicarchive.org/api).\n",
  63 |     "2. Download original `.mp3` by HTTPS for each track id (only if we don't have it already).\n",
  64 |     "\n",
  65 |     "Todo:\n",
  66 |     "* Scrap curators.\n",
  67 |     "* Download images (`track_image_file`, `album_image_file`, `artist_image_file`). Beware the quality.\n",
  68 |     "* Verify checksum for some random tracks.\n",
  69 |     "\n",
  70 |     "Dataset update:\n",
  71 |     "* To add new tracks: iterate from largest known track id to the most recent only.\n",
  72 |     "* To update user data: we need to get all tracks again."
  73 |    ]
  74 |   },
  75 |   {
  76 |    "cell_type": "code",
  77 |    "execution_count": null,
  78 |    "metadata": {},
  79 |    "outputs": [],
  80 |    "source": [
  81 |     "# ./creation.py metadata\n",
  82 |     "# ./creation.py data /path/to/fma/fma_full\n",
  83 |     "# ./creation.py clips /path/to/fma\n",
  84 |     "\n",
  85 |     "#!cat creation.py"
  86 |    ]
  87 |   },
  88 |   {
  89 |    "cell_type": "code",
  90 |    "execution_count": null,
  91 |    "metadata": {},
  92 |    "outputs": [],
  93 |    "source": [
  94 |     "# converters={'genres': ast.literal_eval}\n",
  95 |     "tracks = pd.read_csv('raw_tracks.csv', index_col=0)\n",
  96 |     "albums = pd.read_csv('raw_albums.csv', index_col=0)\n",
  97 |     "artists = pd.read_csv('raw_artists.csv', index_col=0)\n",
  98 |     "genres = pd.read_csv('raw_genres.csv', index_col=0)\n",
  99 |     "\n",
 100 |     "not_found = pickle.load(open('not_found.pickle', 'rb'))"
 101 |    ]
 102 |   },
 103 |   {
 104 |    "cell_type": "code",
 105 |    "execution_count": null,
 106 |    "metadata": {},
 107 |    "outputs": [],
 108 |    "source": [
 109 |     "def get_fs_tids(audio_dir):\n",
 110 |     "    tids = []\n",
 111 |     "    for _, dirnames, files in os.walk(audio_dir):\n",
 112 |     "        if dirnames == []:\n",
 113 |     "            tids.extend(int(file[:-4]) for file in files)\n",
 114 |     "    return tids\n",
 115 |     "\n",
 116 |     "audio_tids = get_fs_tids(FMA_FULL)\n",
 117 |     "clips_tids = get_fs_tids(FMA_LARGE)"
 118 |    ]
 119 |   },
 120 |   {
 121 |    "cell_type": "code",
 122 |    "execution_count": null,
 123 |    "metadata": {},
 124 |    "outputs": [],
 125 |    "source": [
 126 |     "print('tracks: {} collected ({} not found, {} max id)'.format(\n",
 127 |     "    len(tracks), len(not_found['tracks']), tracks.index.max()))\n",
 128 |     "print('albums: {} collected ({} not found, {} in tracks)'.format(\n",
 129 |     "    len(albums), len(not_found['albums']), len(tracks['album_id'].unique())))\n",
 130 |     "print('artists: {} collected ({} not found, {} in tracks)'.format(\n",
 131 |     "    len(artists), len(not_found['artists']), len(tracks['artist_id'].unique())))\n",
 132 |     "print('genres: {} collected'.format(len(genres)))\n",
 133 |     "print('audio: {} collected ({} not found, {} not in tracks)'.format(\n",
 134 |     "    len(audio_tids), len(not_found['audio']), len(set(audio_tids).difference(tracks.index))))\n",
 135 |     "print('clips: {} collected ({} not found, {} not in tracks)'.format(\n",
 136 |     "    len(clips_tids), len(not_found['clips']), len(set(clips_tids).difference(tracks.index))))\n",
 137 |     "assert sum(tracks.index.isin(audio_tids)) + len(not_found['audio']) == len(tracks)\n",
 138 |     "assert sum(tracks.index.isin(clips_tids)) + len(not_found['clips']) == sum(tracks.index.isin(audio_tids))\n",
 139 |     "assert len(clips_tids) + len(not_found['clips']) + len(not_found['audio']) == len(tracks)"
 140 |    ]
 141 |   },
 142 |   {
 143 |    "cell_type": "code",
 144 |    "execution_count": null,
 145 |    "metadata": {},
 146 |    "outputs": [],
 147 |    "source": [
 148 |     "N = 5\n",
 149 |     "ipd.display(tracks.head(N))\n",
 150 |     "ipd.display(albums.head(N))\n",
 151 |     "ipd.display(artists.head(N))\n",
 152 |     "ipd.display(genres.head(N))"
 153 |    ]
 154 |   },
 155 |   {
 156 |    "cell_type": "markdown",
 157 |    "metadata": {},
 158 |    "source": [
 159 |     "## 2 Format metadata\n",
 160 |     "\n",
 161 |     "Todo:\n",
 162 |     "* Sanitize values, e.g. list of words for tags, valid links in `artist_wikipedia_page`, remove html markup in free-form text.\n",
 163 |     "    * Clean tags. E.g. some tags are just artist names.\n",
 164 |     "* Fill metadata about encoding: length, number of samples, sample rate, bit rate, channels (mono/stereo), 16bits?.\n",
 165 |     "* Update duration from audio\n",
 166 |     "    * 2624 is marked as 05:05:50 (18350s) although it is reported as 00:21:15.15 by ffmpeg.\n",
 167 |     "    * 112067: 3714s --> 01:59:55.06, 112808: 3718s --> 01:59:59.56\n",
 168 |     "    * ffmpeg: Estimating duration from bitrate, this may be inaccurate\n",
 169 |     "    * Solution, decode the complete mp3: `ffmpeg -i input.mp3 -f null -`"
 170 |    ]
 171 |   },
 172 |   {
 173 |    "cell_type": "code",
 174 |    "execution_count": null,
 175 |    "metadata": {},
 176 |    "outputs": [],
 177 |    "source": [
 178 |     "df, column = tracks, 'tags'\n",
 179 |     "null = sum(df[column].isnull())\n",
 180 |     "print('{} null, {} non-null'.format(null, df.shape[0] - null))\n",
 181 |     "df[column].value_counts().head(10)"
 182 |    ]
 183 |   },
 184 |   {
 185 |    "cell_type": "markdown",
 186 |    "metadata": {},
 187 |    "source": [
 188 |     "### 2.1 Tracks"
 189 |    ]
 190 |   },
 191 |   {
 192 |    "cell_type": "code",
 193 |    "execution_count": null,
 194 |    "metadata": {},
 195 |    "outputs": [],
 196 |    "source": [
 197 |     "drop = [\n",
 198 |     "    'license_image_file', 'license_image_file_large', 'license_parent_id', 'license_url',  # keep title only\n",
 199 |     "    'track_file', 'track_image_file',  # used to download only\n",
 200 |     "    'track_url', 'album_url', 'artist_url',  # only relevant on website\n",
 201 |     "    'track_copyright_c', 'track_copyright_p',  # present for ~1000 tracks only\n",
 202 |     "    # 'track_composer', 'track_lyricist', 'track_publisher',  # present for ~4000, <1000 and <2000 tracks\n",
 203 |     "    'track_disc_number',  # different from 1 for <1000 tracks\n",
 204 |     "    'track_explicit', 'track_explicit_notes',  # present for <4000 tracks\n",
 205 |     "    'track_instrumental'  # ~6000 tracks have a 1, there is an instrumental genre\n",
 206 |     "]\n",
 207 |     "tracks.drop(drop, axis=1, inplace=True)\n",
 208 |     "tracks.rename(columns={'license_title': 'track_license', 'tags': 'track_tags'}, inplace=True)"
 209 |    ]
 210 |   },
 211 |   {
 212 |    "cell_type": "code",
 213 |    "execution_count": null,
 214 |    "metadata": {},
 215 |    "outputs": [],
 216 |    "source": [
 217 |     "tracks['track_duration'] = tracks['track_duration'].map(creation.convert_duration)"
 218 |    ]
 219 |   },
 220 |   {
 221 |    "cell_type": "code",
 222 |    "execution_count": null,
 223 |    "metadata": {},
 224 |    "outputs": [],
 225 |    "source": [
 226 |     "def convert_datetime(df, column, format=None):\n",
 227 |     "    df[column] = pd.to_datetime(df[column], infer_datetime_format=True, format=format)\n",
 228 |     "convert_datetime(tracks, 'track_date_created')\n",
 229 |     "convert_datetime(tracks, 'track_date_recorded')"
 230 |    ]
 231 |   },
 232 |   {
 233 |    "cell_type": "code",
 234 |    "execution_count": null,
 235 |    "metadata": {},
 236 |    "outputs": [],
 237 |    "source": [
 238 |     "tracks['album_id'].fillna(-1, inplace=True)\n",
 239 |     "tracks['track_bit_rate'].fillna(-1, inplace=True)\n",
 240 |     "tracks = tracks.astype({'album_id': int, 'track_bit_rate': int})"
 241 |    ]
 242 |   },
 243 |   {
 244 |    "cell_type": "code",
 245 |    "execution_count": null,
 246 |    "metadata": {},
 247 |    "outputs": [],
 248 |    "source": [
 249 |     "def convert_genres(genres):\n",
 250 |     "    genres = ast.literal_eval(genres)\n",
 251 |     "    return [int(genre['genre_id']) for genre in genres]\n",
 252 |     "\n",
 253 |     "tracks['track_genres'].fillna('[]', inplace=True)\n",
 254 |     "tracks['track_genres'] = tracks['track_genres'].map(convert_genres)"
 255 |    ]
 256 |   },
 257 |   {
 258 |    "cell_type": "code",
 259 |    "execution_count": null,
 260 |    "metadata": {},
 261 |    "outputs": [],
 262 |    "source": [
 263 |     "tracks.columns"
 264 |    ]
 265 |   },
 266 |   {
 267 |    "cell_type": "markdown",
 268 |    "metadata": {},
 269 |    "source": [
 270 |     "### 2.2 Albums"
 271 |    ]
 272 |   },
 273 |   {
 274 |    "cell_type": "code",
 275 |    "execution_count": null,
 276 |    "metadata": {},
 277 |    "outputs": [],
 278 |    "source": [
 279 |     "drop = [\n",
 280 |     "    'artist_name', 'album_url', 'artist_url',  # in tracks already (though it can be different)\n",
 281 |     "    'album_handle',\n",
 282 |     "    'album_image_file', 'album_images',  # todo: shall be downloaded\n",
 283 |     "    #'album_producer', 'album_engineer',  # present for ~2400 albums only\n",
 284 |     "]\n",
 285 |     "albums.drop(drop, axis=1, inplace=True)\n",
 286 |     "albums.rename(columns={'tags': 'album_tags'}, inplace=True)"
 287 |    ]
 288 |   },
 289 |   {
 290 |    "cell_type": "code",
 291 |    "execution_count": null,
 292 |    "metadata": {},
 293 |    "outputs": [],
 294 |    "source": [
 295 |     "convert_datetime(albums, 'album_date_created')\n",
 296 |     "convert_datetime(albums, 'album_date_released')"
 297 |    ]
 298 |   },
 299 |   {
 300 |    "cell_type": "code",
 301 |    "execution_count": null,
 302 |    "metadata": {},
 303 |    "outputs": [],
 304 |    "source": [
 305 |     "albums.columns"
 306 |    ]
 307 |   },
 308 |   {
 309 |    "cell_type": "markdown",
 310 |    "metadata": {},
 311 |    "source": [
 312 |     "### 2.3 Artists"
 313 |    ]
 314 |   },
 315 |   {
 316 |    "cell_type": "code",
 317 |    "execution_count": null,
 318 |    "metadata": {},
 319 |    "outputs": [],
 320 |    "source": [
 321 |     "drop = [\n",
 322 |     "    'artist_website', 'artist_url',  # in tracks already (though it can be different)\n",
 323 |     "    'artist_handle',\n",
 324 |     "    'artist_image_file', 'artist_images',  # todo: shall be downloaded\n",
 325 |     "    'artist_donation_url', 'artist_paypal_name', 'artist_flattr_name',  # ~1600 & ~400 & ~70, not relevant\n",
 326 |     "    'artist_contact',  # ~1500, not very useful data\n",
 327 |     "    # 'artist_active_year_begin', 'artist_active_year_end',  # ~1400, ~500 only\n",
 328 |     "    # 'artist_associated_labels',  # ~1000\n",
 329 |     "    # 'artist_related_projects',  # only ~800, but can be combined with bio\n",
 330 |     "]\n",
 331 |     "artists.drop(drop, axis=1, inplace=True)\n",
 332 |     "artists.rename(columns={'tags': 'artist_tags'}, inplace=True)"
 333 |    ]
 334 |   },
 335 |   {
 336 |    "cell_type": "code",
 337 |    "execution_count": null,
 338 |    "metadata": {},
 339 |    "outputs": [],
 340 |    "source": [
 341 |     "convert_datetime(artists, 'artist_date_created')\n",
 342 |     "for column in ['artist_active_year_begin', 'artist_active_year_end']:\n",
 343 |     "    artists[column].replace(0.0, np.nan, inplace=True)\n",
 344 |     "    convert_datetime(artists, column, format='%Y.0')"
 345 |    ]
 346 |   },
 347 |   {
 348 |    "cell_type": "code",
 349 |    "execution_count": null,
 350 |    "metadata": {},
 351 |    "outputs": [],
 352 |    "source": [
 353 |     "artists.columns"
 354 |    ]
 355 |   },
 356 |   {
 357 |    "cell_type": "markdown",
 358 |    "metadata": {},
 359 |    "source": [
 360 |     "### 2.4 Merge DataFrames"
 361 |    ]
 362 |   },
 363 |   {
 364 |    "cell_type": "code",
 365 |    "execution_count": null,
 366 |    "metadata": {},
 367 |    "outputs": [],
 368 |    "source": [
 369 |     "not_found['albums'].remove(None)\n",
 370 |     "not_found['albums'].append(-1)\n",
 371 |     "not_found['albums'] = [int(i) for i in not_found['albums']]\n",
 372 |     "not_found['artists'] = [int(i) for i in not_found['artists']]"
 373 |    ]
 374 |   },
 375 |   {
 376 |    "cell_type": "code",
 377 |    "execution_count": null,
 378 |    "metadata": {},
 379 |    "outputs": [],
 380 |    "source": [
 381 |     "tracks = tracks.merge(albums, left_on='album_id', right_index=True, sort=False, how='left', suffixes=('', '_dup'))\n",
 382 |     "\n",
 383 |     "n = sum(tracks['album_title_dup'].isnull())\n",
 384 |     "print('{} tracks without extended album information ({} tracks without album_id)'.format(\n",
 385 |     "    n, sum(tracks['album_id'] == -1)))\n",
 386 |     "assert sum(tracks['album_id'].isin(not_found['albums'])) == n\n",
 387 |     "assert sum(tracks['album_title'] != tracks['album_title_dup']) == n\n",
 388 |     "\n",
 389 |     "tracks.drop('album_title_dup', axis=1, inplace=True)\n",
 390 |     "assert not any('dup' in col for col in tracks.columns)"
 391 |    ]
 392 |   },
 393 |   {
 394 |    "cell_type": "code",
 395 |    "execution_count": null,
 396 |    "metadata": {},
 397 |    "outputs": [],
 398 |    "source": [
 399 |     "# Album artist can be different than track artist. Keep track artist.\n",
 400 |     "#tracks[tracks['artist_name'] != tracks['artist_name_dup']].select(lambda x: 'artist_name' in x, axis=1)"
 401 |    ]
 402 |   },
 403 |   {
 404 |    "cell_type": "code",
 405 |    "execution_count": null,
 406 |    "metadata": {},
 407 |    "outputs": [],
 408 |    "source": [
 409 |     "tracks = tracks.merge(artists, left_on='artist_id', right_index=True, sort=False, how='left', suffixes=('', '_dup'))\n",
 410 |     "\n",
 411 |     "n = sum(tracks['artist_name_dup'].isnull())\n",
 412 |     "print('{} tracks without extended artist information'.format(n))\n",
 413 |     "assert sum(tracks['artist_id'].isin(not_found['artists'])) == n\n",
 414 |     "assert sum(tracks['artist_name'] != tracks[('artist_name_dup')]) == n\n",
 415 |     "\n",
 416 |     "tracks.drop('artist_name_dup', axis=1, inplace=True)\n",
 417 |     "assert not any('dup' in col for col in tracks.columns)"
 418 |    ]
 419 |   },
 420 |   {
 421 |    "cell_type": "code",
 422 |    "execution_count": null,
 423 |    "metadata": {},
 424 |    "outputs": [],
 425 |    "source": [
 426 |     "columns = []\n",
 427 |     "for name in tracks.columns:\n",
 428 |     "    names = name.split('_')\n",
 429 |     "    columns.append((names[0], '_'.join(names[1:])))\n",
 430 |     "tracks.columns = pd.MultiIndex.from_tuples(columns)\n",
 431 |     "assert all(label in ['track', 'album', 'artist'] for label in tracks.columns.get_level_values(0))"
 432 |    ]
 433 |   },
 434 |   {
 435 |    "cell_type": "code",
 436 |    "execution_count": null,
 437 |    "metadata": {},
 438 |    "outputs": [],
 439 |    "source": [
 440 |     "# Todo: fill other columns ?\n",
 441 |     "tracks['album', 'tags'].fillna('[]', inplace=True)\n",
 442 |     "tracks['artist', 'tags'].fillna('[]', inplace=True)\n",
 443 |     "\n",
 444 |     "columns = [('album', 'favorites'), ('album', 'comments'), ('album', 'listens'), ('album', 'tracks'),\n",
 445 |     "           ('artist', 'favorites'), ('artist', 'comments')]\n",
 446 |     "for column in columns:\n",
 447 |     "    tracks[column].fillna(-1, inplace=True)\n",
 448 |     "columns = {column: int for column in columns}\n",
 449 |     "tracks = tracks.astype(columns)"
 450 |    ]
 451 |   },
 452 |   {
 453 |    "cell_type": "markdown",
 454 |    "metadata": {},
 455 |    "source": [
 456 |     "## 3 Data cleaning\n",
 457 |     "\n",
 458 |     "Todo: duplicates (metadata and audio)"
 459 |    ]
 460 |   },
 461 |   {
 462 |    "cell_type": "code",
 463 |    "execution_count": null,
 464 |    "metadata": {},
 465 |    "outputs": [],
 466 |    "source": [
 467 |     "def keep(index, df):\n",
 468 |     "    old = len(df)\n",
 469 |     "    df = df.loc[index]\n",
 470 |     "    new = len(df)\n",
 471 |     "    print('{} lost, {} left'.format(old - new, new))\n",
 472 |     "    return df\n",
 473 |     "\n",
 474 |     "tracks = keep(tracks.index, tracks)"
 475 |    ]
 476 |   },
 477 |   {
 478 |    "cell_type": "code",
 479 |    "execution_count": null,
 480 |    "metadata": {},
 481 |    "outputs": [],
 482 |    "source": [
 483 |     "# Audio not found or could not be trimmed.\n",
 484 |     "tracks = keep(tracks.index.difference(not_found['audio']), tracks)\n",
 485 |     "tracks = keep(tracks.index.difference(not_found['clips']), tracks)"
 486 |    ]
 487 |   },
 488 |   {
 489 |    "cell_type": "markdown",
 490 |    "metadata": {},
 491 |    "source": [
 492 |     "Errors from the `features.py` script.\n",
 493 |     "* IndexError('index 0 is out of bounds for axis 0 with size 0',)\n",
 494 |     "    * ffmpeg: Header missing\n",
 495 |     "    * ffmpeg: Could not find codec parameters for stream 0 (Audio: mp3, 0 channels, s16p): unspecified frame size. Consider increasing the value for the 'analyzeduration' and 'probesize' options\n",
 496 |     "    * tids: 117759\n",
 497 |     "* NoBackendError()\n",
 498 |     "    * ffmpeg: Format mp3 detected only with low score of 1, misdetection possible!\n",
 499 |     "    * tids: 80015, 115235\n",
 500 |     "* UserWarning('Trying to estimate tuning from empty frequency set.',)\n",
 501 |     "    * librosa error\n",
 502 |     "    * tids: 1440, 26436, 38903, 57603, 62095, 62954, 62956, 62957, 62959, 62971, 86079, 96426, 104623, 106719, 109714, 114501, 114528, 118003, 118004, 127827, 130298, 130296, 131076, 135804, 154923\n",
 503 |     "* ParameterError('Filter pass-band lies beyond Nyquist',)\n",
 504 |     "    * librosa error\n",
 505 |     "    * tids: 152204, 28106, 29166, 29167, 29169, 29168, 29170, 29171, 29172, 29173, 29179, 43903, 56757, 59361, 75461, 92346, 92345, 92347, 92349, 92350, 92351, 92353, 92348, 92352, 92354, 92355, 92356, 92358, 92359, 92361, 92360, 114448, 136486, 144769, 144770, 144771, 144773, 144774, 144775, 144778, 144776, 144777"
 506 |    ]
 507 |   },
 508 |   {
 509 |    "cell_type": "code",
 510 |    "execution_count": null,
 511 |    "metadata": {},
 512 |    "outputs": [],
 513 |    "source": [
 514 |     "# Feature extraction failed.\n",
 515 |     "FAILED = [1440, 26436, 28106, 29166, 29167, 29168, 29169, 29170, 29171, 29172,\n",
 516 |     "          29173, 29179, 38903, 43903, 56757, 57603, 59361, 62095, 62954, 62956,\n",
 517 |     "          62957, 62959, 62971, 75461, 80015, 86079, 92345, 92346, 92347, 92348,\n",
 518 |     "          92349, 92350, 92351, 92352, 92353, 92354, 92355, 92356, 92357, 92358,\n",
 519 |     "          92359, 92360, 92361, 96426, 104623, 106719, 109714, 114448, 114501,114528,\n",
 520 |     "          115235, 117759, 118003, 118004, 127827, 130296, 130298, 131076, 135804, 136486,\n",
 521 |     "          144769, 144770, 144771, 144773, 144774, 144775, 144776, 144777, 144778, 152204,\n",
 522 |     "          154923]\n",
 523 |     "tracks = keep(tracks.index.difference(FAILED), tracks)"
 524 |    ]
 525 |   },
 526 |   {
 527 |    "cell_type": "code",
 528 |    "execution_count": null,
 529 |    "metadata": {},
 530 |    "outputs": [],
 531 |    "source": [
 532 |     "# License forbids redistribution.\n",
 533 |     "tracks = keep(tracks['track', 'license'] != 'FMA-Limited: Download Only', tracks)\n",
 534 |     "print('{} licenses'.format(len(tracks[('track', 'license')].unique())))"
 535 |    ]
 536 |   },
 537 |   {
 538 |    "cell_type": "code",
 539 |    "execution_count": null,
 540 |    "metadata": {},
 541 |    "outputs": [],
 542 |    "source": [
 543 |     "#sum(tracks['track', 'title'].duplicated())"
 544 |    ]
 545 |   },
 546 |   {
 547 |    "cell_type": "markdown",
 548 |    "metadata": {},
 549 |    "source": [
 550 |     "## 4 Genres"
 551 |    ]
 552 |   },
 553 |   {
 554 |    "cell_type": "code",
 555 |    "execution_count": null,
 556 |    "metadata": {},
 557 |    "outputs": [],
 558 |    "source": [
 559 |     "genres.drop(['genre_handle', 'genre_color'], axis=1, inplace=True)\n",
 560 |     "genres.rename(columns={'genre_parent_id': 'parent', 'genre_title': 'title'}, inplace=True)"
 561 |    ]
 562 |   },
 563 |   {
 564 |    "cell_type": "code",
 565 |    "execution_count": null,
 566 |    "metadata": {},
 567 |    "outputs": [],
 568 |    "source": [
 569 |     "genres['parent'].fillna(0, inplace=True)\n",
 570 |     "genres = genres.astype({'parent': int})"
 571 |    ]
 572 |   },
 573 |   {
 574 |    "cell_type": "code",
 575 |    "execution_count": null,
 576 |    "metadata": {},
 577 |    "outputs": [],
 578 |    "source": [
 579 |     "# 13 (Easy Listening) has parent 126 which is missing\n",
 580 |     "# --> a root genre on the website, although not in the genre menu\n",
 581 |     "genres.at[13, 'parent'] = 0\n",
 582 |     "\n",
 583 |     "# 580 (Abstract Hip-Hop) has parent 1172 which is missing\n",
 584 |     "# --> listed as child of Hip-Hop on the website\n",
 585 |     "genres.at[580, 'parent'] = 21\n",
 586 |     "\n",
 587 |     "# 810 (Nu-Jazz) has parent 51 which is missing\n",
 588 |     "# --> listed as child of Easy Listening on website\n",
 589 |     "genres.at[810, 'parent'] = 13\n",
 590 |     "\n",
 591 |     "# 763 (Holiday) has parent 763 which is itself\n",
 592 |     "# --> listed as child of Sound Effects on website\n",
 593 |     "genres.at[763, 'parent'] = 16\n",
 594 |     "\n",
 595 |     "# Todo: should novelty be under Experimental? It is alone on website."
 596 |    ]
 597 |   },
 598 |   {
 599 |    "cell_type": "code",
 600 |    "execution_count": null,
 601 |    "metadata": {},
 602 |    "outputs": [],
 603 |    "source": [
 604 |     "# Genre 806 (hiphop) should not exist. Replace it by 21 (Hip-Hop).\n",
 605 |     "print('{} tracks have genre 806'.format(\n",
 606 |     "    sum(tracks['track', 'genres'].map(lambda genres: 806 in genres))))\n",
 607 |     "def change_genre(genres):\n",
 608 |     "    return [genre if genre != 806 else 21 for genre in genres]\n",
 609 |     "tracks['track', 'genres'] = tracks['track', 'genres'].map(change_genre)\n",
 610 |     "genres.drop(806, inplace=True)"
 611 |    ]
 612 |   },
 613 |   {
 614 |    "cell_type": "code",
 615 |    "execution_count": null,
 616 |    "metadata": {},
 617 |    "outputs": [],
 618 |    "source": [
 619 |     "def get_parent(genre, track_all_genres=None):\n",
 620 |     "    parent = genres.at[genre, 'parent']\n",
 621 |     "    if track_all_genres is not None:\n",
 622 |     "        track_all_genres.append(genre)\n",
 623 |     "    return genre if parent == 0 else get_parent(parent, track_all_genres)\n",
 624 |     "\n",
 625 |     "# Get all genres, i.e. all genres encountered when walking from leafs to roots.\n",
 626 |     "def get_all_genres(track_genres):\n",
 627 |     "    track_all_genres = list()\n",
 628 |     "    for genre in track_genres:\n",
 629 |     "        get_parent(genre, track_all_genres)\n",
 630 |     "    return list(set(track_all_genres))\n",
 631 |     "\n",
 632 |     "tracks['track', 'genres_all'] = tracks['track', 'genres'].map(get_all_genres)"
 633 |    ]
 634 |   },
 635 |   {
 636 |    "cell_type": "code",
 637 |    "execution_count": null,
 638 |    "metadata": {},
 639 |    "outputs": [],
 640 |    "source": [
 641 |     "# Number of tracks per genre.\n",
 642 |     "def count_genres(subset=tracks.index):\n",
 643 |     "    count = pd.Series(0, index=genres.index)\n",
 644 |     "    for _, track_all_genres in tracks.loc[subset, ('track', 'genres_all')].items():\n",
 645 |     "        for genre in track_all_genres:\n",
 646 |     "            count[genre] += 1\n",
 647 |     "    return count\n",
 648 |     "\n",
 649 |     "genres['#tracks'] = count_genres()\n",
 650 |     "genres[genres['#tracks'] == 0]"
 651 |    ]
 652 |   },
 653 |   {
 654 |    "cell_type": "code",
 655 |    "execution_count": null,
 656 |    "metadata": {},
 657 |    "outputs": [],
 658 |    "source": [
 659 |     "def get_top_genre(track_genres):\n",
 660 |     "    top_genres = set(genres.at[genres.at[genre, 'top_level'], 'title'] for genre in track_genres)\n",
 661 |     "    return top_genres.pop() if len(top_genres) == 1 else np.nan\n",
 662 |     "\n",
 663 |     "# Top-level genre.\n",
 664 |     "genres['top_level'] = genres.index.map(get_parent)\n",
 665 |     "tracks['track', 'genre_top'] = tracks['track', 'genres'].map(get_top_genre)"
 666 |    ]
 667 |   },
 668 |   {
 669 |    "cell_type": "code",
 670 |    "execution_count": null,
 671 |    "metadata": {},
 672 |    "outputs": [],
 673 |    "source": [
 674 |     "genres.head(10)"
 675 |    ]
 676 |   },
 677 |   {
 678 |    "cell_type": "markdown",
 679 |    "metadata": {},
 680 |    "source": [
 681 |     "## 5 Subsets: large, medium, small"
 682 |    ]
 683 |   },
 684 |   {
 685 |    "cell_type": "markdown",
 686 |    "metadata": {},
 687 |    "source": [
 688 |     "### 5.1 Large\n",
 689 |     "\n",
 690 |     "Main characteristic: the full set with clips trimmed to a manageable size."
 691 |    ]
 692 |   },
 693 |   {
 694 |    "cell_type": "markdown",
 695 |    "metadata": {},
 696 |    "source": [
 697 |     "### 5.2 Medium\n",
 698 |     "\n",
 699 |     "Main characteristic: clean metadata (includes 1 top-level genre) and quality audio."
 700 |    ]
 701 |   },
 702 |   {
 703 |    "cell_type": "code",
 704 |    "execution_count": null,
 705 |    "metadata": {},
 706 |    "outputs": [],
 707 |    "source": [
 708 |     "fma_medium = pd.DataFrame(tracks)"
 709 |    ]
 710 |   },
 711 |   {
 712 |    "cell_type": "code",
 713 |    "execution_count": null,
 714 |    "metadata": {},
 715 |    "outputs": [],
 716 |    "source": [
 717 |     "# Missing meta-information.\n",
 718 |     "\n",
 719 |     "# Missing extended album and artist information.\n",
 720 |     "fma_medium = keep(~fma_medium['album', 'id'].isin(not_found['albums']), fma_medium)\n",
 721 |     "fma_medium = keep(~fma_medium['artist', 'id'].isin(not_found['artists']), fma_medium)\n",
 722 |     "\n",
 723 |     "# Untitled track or album.\n",
 724 |     "fma_medium = keep(~fma_medium['track', 'title'].isnull(), fma_medium)\n",
 725 |     "fma_medium = keep(fma_medium['track', 'title'].map(lambda x: 'untitled' in x.lower()) == False, fma_medium)\n",
 726 |     "fma_medium = keep(fma_medium['album', 'title'].map(lambda x: 'untitled' in x.lower()) == False, fma_medium)\n",
 727 |     "\n",
 728 |     "# One tag is often just the artist name. Tags too scarce for tracks and albums.\n",
 729 |     "#keep(fma_medium['artist', 'tags'].map(len) >= 2, fma_medium)\n",
 730 |     "\n",
 731 |     "# Too scarce.\n",
 732 |     "#fma_medium = keep(~fma_medium['album', 'information'].isnull(), fma_medium)\n",
 733 |     "#fma_medium = keep(~fma_medium['artist', 'bio'].isnull(), fma_medium)\n",
 734 |     "#fma_medium = keep(~fma_medium['artist', 'website'].isnull(), fma_medium)\n",
 735 |     "#fma_medium = keep(~fma_medium['artist', 'wikipedia_page'].isnull(), fma_medium)\n",
 736 |     "\n",
 737 |     "# Too scarce.\n",
 738 |     "#fma_medium = keep(~fma_medium['artist', 'location'].isnull(), fma_medium)\n",
 739 |     "#fma_medium = keep(~fma_medium['artist', 'latitude'].isnull(), fma_medium)\n",
 740 |     "#fma_medium = keep(~fma_medium['artist', 'longitude'].isnull(), fma_medium)"
 741 |    ]
 742 |   },
 743 |   {
 744 |    "cell_type": "code",
 745 |    "execution_count": null,
 746 |    "metadata": {},
 747 |    "outputs": [],
 748 |    "source": [
 749 |     "# Technical quality.\n",
 750 |     "# Todo: sample rate\n",
 751 |     "fma_medium = keep(fma_medium['track', 'bit_rate'] > 100000, fma_medium)\n",
 752 |     "\n",
 753 |     "# Choosing standard bit rates discards all VBR.\n",
 754 |     "#fma_medium = keep(fma_medium['track', 'bit_rate'].isin([320000, 256000, 192000, 160000, 128000]), fma_medium)"
 755 |    ]
 756 |   },
 757 |   {
 758 |    "cell_type": "code",
 759 |    "execution_count": null,
 760 |    "metadata": {},
 761 |    "outputs": [],
 762 |    "source": [
 763 |     "fma_medium = keep(fma_medium['track', 'duration'] >= 60, fma_medium)\n",
 764 |     "fma_medium = keep(fma_medium['track', 'duration'] <= 600, fma_medium)\n",
 765 |     "\n",
 766 |     "fma_medium = keep(fma_medium['album', 'tracks'] >= 1, fma_medium)\n",
 767 |     "fma_medium = keep(fma_medium['album', 'tracks'] <= 50, fma_medium)"
 768 |    ]
 769 |   },
 770 |   {
 771 |    "cell_type": "code",
 772 |    "execution_count": null,
 773 |    "metadata": {},
 774 |    "outputs": [],
 775 |    "source": [
 776 |     "# Lower popularity bound.\n",
 777 |     "fma_medium = keep(fma_medium['track', 'listens'] >= 100, fma_medium)\n",
 778 |     "fma_medium = keep(fma_medium['track', 'interest'] >= 200, fma_medium)\n",
 779 |     "fma_medium = keep(fma_medium['album', 'listens'] >= 1000, fma_medium);\n",
 780 |     "\n",
 781 |     "# Favorites and comments are very scarce.\n",
 782 |     "#fma_medium = keep(fma_medium['artist', 'favorites'] >= 1, fma_medium)"
 783 |    ]
 784 |   },
 785 |   {
 786 |    "cell_type": "code",
 787 |    "execution_count": null,
 788 |    "metadata": {},
 789 |    "outputs": [],
 790 |    "source": [
 791 |     "# Targeted genre classification.\n",
 792 |     "fma_medium = keep(~fma_medium['track', 'genre_top'].isnull(), fma_medium);\n",
 793 |     "#keep(fma_medium['track', 'genres'].map(len) == 1, fma_medium);"
 794 |    ]
 795 |   },
 796 |   {
 797 |    "cell_type": "code",
 798 |    "execution_count": null,
 799 |    "metadata": {},
 800 |    "outputs": [],
 801 |    "source": [
 802 |     "# Adjust size with popularity measure. Should be of better quality.\n",
 803 |     "N_TRACKS = 25000\n",
 804 |     "\n",
 805 |     "# Observations\n",
 806 |     "# * More albums killed than artists --> be sure not to kill diversity\n",
 807 |     "# * Favorites and preterites genres differently --> do it per genre?\n",
 808 |     "# Normalization\n",
 809 |     "# * mean, median, std, max\n",
 810 |     "# * tracks per album or artist\n",
 811 |     "# Test\n",
 812 |     "# * 4/5 of same tracks were selected with various set of measures\n",
 813 |     "# * <5% diff with max and mean\n",
 814 |     "\n",
 815 |     "popularity_measures = [('track', 'listens'), ('track', 'interest')]  # ('album', 'listens')\n",
 816 |     "# ('track', 'favorites'), ('track', 'comments'),\n",
 817 |     "# ('album', 'favorites'), ('album', 'comments'),\n",
 818 |     "# ('artist', 'favorites'), ('artist', 'comments'),\n",
 819 |     "\n",
 820 |     "normalization = {measure: fma_medium[measure].max() for measure in popularity_measures}\n",
 821 |     "def popularity_measure(track):\n",
 822 |     "    return sum(track[measure] / normalization[measure] for measure in popularity_measures)\n",
 823 |     "fma_medium['popularity_measure'] = fma_medium.apply(popularity_measure, axis=1)\n",
 824 |     "fma_medium = keep(fma_medium.sort_values('popularity_measure', ascending=False).index[:N_TRACKS], fma_medium)"
 825 |    ]
 826 |   },
 827 |   {
 828 |    "cell_type": "code",
 829 |    "execution_count": null,
 830 |    "metadata": {},
 831 |    "outputs": [],
 832 |    "source": [
 833 |     "tmp = genres[genres['parent'] == 0].reset_index().set_index('title')\n",
 834 |     "tmp['#tracks_medium'] = fma_medium['track', 'genre_top'].value_counts()\n",
 835 |     "tmp.sort_values('#tracks_medium', ascending=False)"
 836 |    ]
 837 |   },
 838 |   {
 839 |    "cell_type": "markdown",
 840 |    "metadata": {},
 841 |    "source": [
 842 |     "### 5.3 Small\n",
 843 |     "\n",
 844 |     "Main characteristic: genre balanced (and echonest features).\n",
 845 |     "\n",
 846 |     "Choices:\n",
 847 |     "* 8 genres with 1000 tracks --> 8,000 tracks\n",
 848 |     "* 10 genres with 500 tracks --> 5,000 tracks\n",
 849 |     "\n",
 850 |     "Todo:\n",
 851 |     "* Download more echonest features so that all tracks can have them. Otherwise intersection of tracks with echonest features and one top-level genre is too small."
 852 |    ]
 853 |   },
 854 |   {
 855 |    "cell_type": "code",
 856 |    "execution_count": null,
 857 |    "metadata": {},
 858 |    "outputs": [],
 859 |    "source": [
 860 |     "N_GENRES = 8\n",
 861 |     "N_TRACKS = 1000\n",
 862 |     "\n",
 863 |     "top_genres = tmp.sort_values('#tracks_medium', ascending=False)[:N_GENRES].index\n",
 864 |     "fma_small = pd.DataFrame(fma_medium)\n",
 865 |     "fma_small = keep(fma_small['track', 'genre_top'].isin(top_genres), fma_small)"
 866 |    ]
 867 |   },
 868 |   {
 869 |    "cell_type": "code",
 870 |    "execution_count": null,
 871 |    "metadata": {},
 872 |    "outputs": [],
 873 |    "source": [
 874 |     "to_keep = []\n",
 875 |     "for genre in top_genres:\n",
 876 |     "    subset = fma_small[fma_small['track', 'genre_top'] == genre]\n",
 877 |     "    drop = subset.sort_values('popularity_measure').index[:-N_TRACKS]\n",
 878 |     "    fma_small.drop(drop, inplace=True)\n",
 879 |     "assert len(fma_small) == N_GENRES * N_TRACKS"
 880 |    ]
 881 |   },
 882 |   {
 883 |    "cell_type": "markdown",
 884 |    "metadata": {},
 885 |    "source": [
 886 |     "### 5.4 Subset indication"
 887 |    ]
 888 |   },
 889 |   {
 890 |    "cell_type": "code",
 891 |    "execution_count": null,
 892 |    "metadata": {},
 893 |    "outputs": [],
 894 |    "source": [
 895 |     "SUBSETS = ('small', 'medium', 'large')\n",
 896 |     "tracks['set', 'subset'] = pd.Series().astype('category', categories=SUBSETS, ordered=True)\n",
 897 |     "tracks.loc[tracks.index, ('set', 'subset')] = 'large'\n",
 898 |     "tracks.loc[fma_medium.index, ('set', 'subset')] = 'medium'\n",
 899 |     "tracks.loc[fma_small.index, ('set', 'subset')] = 'small'"
 900 |    ]
 901 |   },
 902 |   {
 903 |    "cell_type": "markdown",
 904 |    "metadata": {},
 905 |    "source": [
 906 |     "### 5.5 Echonest"
 907 |    ]
 908 |   },
 909 |   {
 910 |    "cell_type": "code",
 911 |    "execution_count": null,
 912 |    "metadata": {},
 913 |    "outputs": [],
 914 |    "source": [
 915 |     "echonest = pd.read_csv('raw_echonest.csv', index_col=0, header=[0, 1, 2])\n",
 916 |     "echonest = keep(~echonest['echonest', 'temporal_features'].isnull().any(axis=1), echonest)\n",
 917 |     "echonest = keep(~echonest['echonest', 'audio_features'].isnull().any(axis=1), echonest)\n",
 918 |     "echonest = keep(~echonest['echonest', 'social_features'].isnull().any(axis=1), echonest)\n",
 919 |     "\n",
 920 |     "echonest = keep(echonest.index.isin(tracks.index), echonest);\n",
 921 |     "keep(echonest.index.isin(fma_medium.index), echonest);\n",
 922 |     "keep(echonest.index.isin(fma_small.index), echonest);"
 923 |    ]
 924 |   },
 925 |   {
 926 |    "cell_type": "markdown",
 927 |    "metadata": {},
 928 |    "source": [
 929 |     "## 6 Splits: training, validation, test\n",
 930 |     "\n",
 931 |     "Take into account:\n",
 932 |     "* Artists may only appear on one side.\n",
 933 |     "* Stratification: ideally, all characteristics (#tracks per artist, duration, sampling rate, information, bio) and targets (genres, tags) should be equally distributed."
 934 |    ]
 935 |   },
 936 |   {
 937 |    "cell_type": "code",
 938 |    "execution_count": null,
 939 |    "metadata": {},
 940 |    "outputs": [],
 941 |    "source": [
 942 |     "for genre in genres.index:\n",
 943 |     "    tracks['genre', genres.at[genre, 'title']] = tracks['track', 'genres_all'].map(lambda genres: genre in genres)\n",
 944 |     "\n",
 945 |     "SPLITS = ('training', 'test', 'validation')\n",
 946 |     "PERCENTAGES = (0.8, 0.1, 0.1)\n",
 947 |     "tracks['set', 'split'] = pd.Series().astype('category', categories=SPLITS)\n",
 948 |     "\n",
 949 |     "for subset in SUBSETS:\n",
 950 |     "\n",
 951 |     "    tracks_subset = tracks['set', 'subset'] <= subset\n",
 952 |     "\n",
 953 |     "    # Consider only top-level genres for small and medium.\n",
 954 |     "    genre_list = list(tracks.loc[tracks_subset, ('track', 'genre_top')].unique())\n",
 955 |     "    if subset == 'large':\n",
 956 |     "        genre_list = list(genres['title']) \n",
 957 |     "\n",
 958 |     "    while True:\n",
 959 |     "        if len(genre_list) == 0:\n",
 960 |     "            break\n",
 961 |     "\n",
 962 |     "        # Choose most constrained genre, i.e. genre with the least unassigned artists.\n",
 963 |     "        tracks_unsplit = tracks['set', 'split'].isnull()\n",
 964 |     "        count = tracks[tracks_subset & tracks_unsplit].set_index(('artist', 'id'), append=True)['genre']\n",
 965 |     "        count = count.groupby(level=1).sum().astype(np.bool).sum()\n",
 966 |     "        genre = np.argmin(count[genre_list])\n",
 967 |     "        genre_list.remove(genre)\n",
 968 |     "        \n",
 969 |     "        # Given genre, select artists.\n",
 970 |     "        tracks_genre = tracks['genre', genre] == 1\n",
 971 |     "        artists = tracks.loc[tracks_genre & tracks_subset & tracks_unsplit, ('artist', 'id')].value_counts()\n",
 972 |     "        #print('-->', genre, len(artists))\n",
 973 |     "\n",
 974 |     "        current = {split: np.sum(tracks_genre & tracks_subset & (tracks['set', 'split'] == split)) for split in SPLITS}\n",
 975 |     "\n",
 976 |     "        # Assign artists with most tracks first.\n",
 977 |     "        for artist, count in artists.items():\n",
 978 |     "            choice = np.argmin([current[split] / percentage for split, percentage in zip(SPLITS, PERCENTAGES)])\n",
 979 |     "            current[SPLITS[choice]] += count\n",
 980 |     "            #assert tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')].isnull().all()\n",
 981 |     "            tracks.loc[tracks['artist', 'id'] == artist, ('set', 'split')] = SPLITS[choice]\n",
 982 |     "\n",
 983 |     "# Tracks without genre can only serve as unlabeled data for training, e.g. for semi-supervised algorithms.\n",
 984 |     "no_genres = tracks['track', 'genres_all'].map(lambda genres: len(genres) == 0)\n",
 985 |     "no_split = tracks['set', 'split'].isnull()\n",
 986 |     "assert not (no_split & ~no_genres).any()\n",
 987 |     "tracks.loc[no_split, ('set', 'split')] = 'training'\n",
 988 |     "\n",
 989 |     "# Not needed any more.\n",
 990 |     "tracks.drop('genre', axis=1, level=0, inplace=True)"
 991 |    ]
 992 |   },
 993 |   {
 994 |    "cell_type": "markdown",
 995 |    "metadata": {},
 996 |    "source": [
 997 |     "## 7 Store"
 998 |    ]
 999 |   },
1000 |   {
1001 |    "cell_type": "code",
1002 |    "execution_count": null,
1003 |    "metadata": {},
1004 |    "outputs": [],
1005 |    "source": [
1006 |     "for dataset in 'tracks', 'genres', 'echonest':\n",
1007 |     "    eval(dataset).sort_index(axis=0, inplace=True)\n",
1008 |     "    eval(dataset).sort_index(axis=1, inplace=True)\n",
1009 |     "    params = dict(float_format='%.10f') if dataset == 'echonest' else dict()\n",
1010 |     "    eval(dataset).to_csv(dataset + '.csv', **params)"
1011 |    ]
1012 |   },
1013 |   {
1014 |    "cell_type": "code",
1015 |    "execution_count": null,
1016 |    "metadata": {},
1017 |    "outputs": [],
1018 |    "source": [
1019 |     "# ./creation.py normalize /path/to/fma\n",
1020 |     "# ./creation.py zips /path/to/fma"
1021 |    ]
1022 |   },
1023 |   {
1024 |    "cell_type": "markdown",
1025 |    "metadata": {},
1026 |    "source": [
1027 |     "## 8 Description"
1028 |    ]
1029 |   },
1030 |   {
1031 |    "cell_type": "code",
1032 |    "execution_count": null,
1033 |    "metadata": {},
1034 |    "outputs": [],
1035 |    "source": [
1036 |     "tracks = utils.load('tracks.csv')\n",
1037 |     "tracks.dtypes"
1038 |    ]
1039 |   },
1040 |   {
1041 |    "cell_type": "code",
1042 |    "execution_count": null,
1043 |    "metadata": {},
1044 |    "outputs": [],
1045 |    "source": [
1046 |     "N = 5\n",
1047 |     "ipd.display(tracks['track'].head(N))\n",
1048 |     "ipd.display(tracks['album'].head(N))\n",
1049 |     "ipd.display(tracks['artist'].head(N))"
1050 |    ]
1051 |   }
1052 |  ],
1053 |  "metadata": {},
1054 |  "nbformat": 4,
1055 |  "nbformat_minor": 2
1056 | }
1057 | 


--------------------------------------------------------------------------------