."
96 |
97 | # Questions that participants must answer before they are permitted to
98 | # perform evaluation. If a multiple choice question has correct_answer
99 | # defined, the participant must select that answer to be able to continue
100 | # to the evaluation.
101 | prescreen_questions: []
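# A hypothetical example of the expected entry format (an assumption; the
# field names mirror the followup_questions schema below, plus the
# correct_answer field mentioned above):
# prescreen_questions:
#   - name: Headphones
#     type: multiple-choice
#     text: Are you wearing headphones?
#     answers: ['Yes', 'No']
#     correct_answer: 'Yes'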
102 |
103 | # Include an audio listening test
104 | listening_test:
105 |
106 | # Listening test instructions
107 | instructions: "
108 | ## **Instructions** \nMake sure your headphones are on and your volume
109 | is turned up to a comfortable level. Listen to the audio. Then, select
110 | how many tones you heard."
111 |
112 | # Number of questions to include on the listening test
113 | num_questions: 2
114 |
115 | # Number of allowed retries before the participant fails the test
116 | retries: 2
117 |
118 | # Instructions presented to the participant during evaluation
119 | survey_instructions: "
120 |
121 | ## **Instructions** \nListen to the audio file a minimum of two times.
122 | Select the words that were emphasized by the speaker. The emphasized
123 | words are those that stand out from nearby words. Play the audio and then
124 | click on a word to select (boldface) or deselect it."
125 |
126 | # Questions presented to the participant after evaluation
127 | followup_questions:
128 |
129 | # Ask participant for their native language
130 | - name: Language
131 |
132 | # The type of question. One of [free-response, multiple-choice].
133 | type: multiple-choice
134 |
135 | # Question text
136 | text: What is your native language?
137 |
138 | # Possible answers
139 | answers: [
140 | 'Albanian',
141 | 'Amharic',
142 | 'Arabic',
143 | 'Bengali',
144 | 'Berber',
145 | 'Creole',
146 | 'Dari',
147 | 'Dzongkha',
148 | 'English',
149 | 'Farsi',
150 | 'Filipino',
151 | 'French',
152 | 'German',
153 | 'Gujarati',
154 | 'Hakka',
155 | 'Hausa',
156 | 'Hebrew',
157 | 'Hindi',
158 | 'Hokkien',
159 | 'Indonesian',
160 | 'Italian',
161 | 'Japanese',
162 | 'Javanese',
163 | 'Kannada',
164 | 'Korean',
165 | 'Mandarin Chinese',
166 | 'Marathi',
167 | 'Nepali',
168 | 'Nigerian Pidgin',
169 | 'Oromo',
170 | 'Pashto',
171 | 'Patois',
172 | 'Polish',
173 | 'Portuguese',
174 | 'Russian',
175 | 'Spanish',
176 | 'Swahili',
177 | 'Somali',
178 | 'Tagalog',
179 | 'Tamil',
180 | 'Telugu',
181 | 'Thai',
182 | 'Turkish',
183 | 'Ukrainian',
184 | 'Urdu',
185 | 'Uzbek',
186 | 'Vietnamese',
187 | 'Western Punjabi',
188 | 'Wu Chinese',
189 | 'Yue Chinese',
190 | 'Other']
191 |
192 | # Ask participant for their country of origin
193 | - name: Country
194 |
195 | # The type of question. One of [free-response, multiple-choice].
196 | type: multiple-choice
197 |
198 | # Question text
199 | text: What country/region did you live in during your childhood?
200 |
201 | # Possible answers
202 | answers: [
203 | 'Afghanistan',
204 | 'Albania',
205 | 'Argentina',
206 | 'Bangladesh',
207 | 'Bhutan',
208 | 'Brazil',
209 | 'Cameroon',
210 | 'Canada',
211 | 'China',
212 | 'Colombia',
213 | 'Cuba',
214 | 'Dominican Republic',
215 | 'Ecuador',
216 | 'Egypt',
217 | 'El Salvador',
218 | 'Ethiopia',
219 | 'France',
220 | 'Germany',
221 | 'Ghana',
222 | 'Guatemala',
223 | 'Guyana',
224 | 'Haiti',
225 | 'Honduras',
226 | 'India',
227 | 'Iran',
228 | 'Iraq',
229 | 'Israel',
230 | 'Jamaica',
231 | 'Japan',
232 | 'Jordan',
233 | 'Kenya',
234 | 'Mexico',
235 | 'Morocco',
236 | 'Nepal',
237 | 'Nicaragua',
238 | 'Nigeria',
239 | 'Pakistan',
240 | 'Peru',
241 | 'Philippines',
242 | 'Poland',
243 | 'Russia',
244 | 'Somalia',
245 | 'South Korea',
246 | 'Syria',
247 | 'Taiwan',
248 | 'Thailand',
249 | 'Turkey',
250 | 'Ukraine',
251 | 'United Kingdom',
252 | 'United States',
253 | 'Uzbekistan',
254 | 'Venezuela',
255 | 'Vietnam',
256 | 'Yemen',
257 | 'Other']
258 |
--------------------------------------------------------------------------------
/emphases/assets/partitions/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/assets/partitions/.gitkeep
--------------------------------------------------------------------------------
/emphases/assets/partitions/buckeye.json:
--------------------------------------------------------------------------------
1 | {
2 | "train": [],
3 | "valid": [],
4 | "test": [
5 | "s25-1",
6 | "s04-1",
7 | "s16-1",
8 | "s26-1",
9 | "s02-1",
10 | "s03-1",
11 | "s22-1",
12 | "s32-1",
13 | "s21-1",
14 | "s24-1",
15 | "s17-1",
16 | "s14-1",
17 | "s11-1"
18 | ]
19 | }
--------------------------------------------------------------------------------
/emphases/baselines/__init__.py:
--------------------------------------------------------------------------------
1 | from . import prominence
2 | from . import duration_variance
3 | from . import pitch_variance
4 |
--------------------------------------------------------------------------------
/emphases/baselines/duration_variance/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 |
--------------------------------------------------------------------------------
/emphases/baselines/duration_variance/core.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | ###############################################################################
5 | # Duration variance baseline method
6 | ###############################################################################
7 |
8 |
9 | def infer(alignment):
10 | """Compute per-word emphasis scores using duration variance method"""
11 | # Average duration of phonemes in the sentence
12 | average_duration = alignment.duration() / len(alignment.phonemes())
13 |
14 | # Average duration of phonemes in each word
15 | average_duration_per_word = torch.tensor([
16 | word.duration() / len(word) for word in alignment])
17 |
18 | # Zero-center
19 | return (average_duration_per_word - average_duration)[None]
20 |
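A minimal usage sketch (the TextGrid path is hypothetical; pypar.Alignment is the alignment type used elsewhere in this repository, e.g., in emphases/data/dataset.py):

import pypar

import emphases

# Load a forced alignment from a TextGrid file (hypothetical path)
alignment = pypar.Alignment('example.TextGrid')

# Per-word emphasis scores; shape is (1, number of words)
scores = emphases.baselines.duration_variance.infer(alignment)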
--------------------------------------------------------------------------------
/emphases/baselines/pitch_variance/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 |
--------------------------------------------------------------------------------
/emphases/baselines/pitch_variance/core.py:
--------------------------------------------------------------------------------
1 | import penn
2 | import torch
3 |
4 | import emphases
5 |
6 |
7 | ###############################################################################
8 | # Pitch variance method
9 | ###############################################################################
10 |
11 |
12 | def infer(alignment, audio, sample_rate, gpu=None):
13 | """Compute per-word emphasis scores using pitch variance method"""
14 | # Infer pitch and periodicity
15 | pitch, _ = penn.from_audio(
16 | audio,
17 | sample_rate,
18 | hopsize=emphases.HOPSIZE_SECONDS,
19 | fmin=emphases.FMIN,
20 | fmax=emphases.FMAX,
21 | pad=True,
22 | interp_unvoiced_at=emphases.VOICED_THRESHOLD,
23 | gpu=gpu)
24 |
25 | # Compute pitch statistics in base-two log-space
26 | pitch = torch.log2(pitch)
27 |
28 | # Compute utterance statistics
29 | utterance_spread = spread(pitch)
30 |
31 | # Compute word statistics
32 | word_spreads = []
33 | for word in alignment:
34 | start = int(emphases.convert.seconds_to_frames(word.start()))
35 | end = int(emphases.convert.seconds_to_frames(word.end()))
36 | word_spreads.append(spread(pitch[0, start:end]))
37 | word_spreads = torch.tensor(
38 | word_spreads,
39 | dtype=pitch.dtype,
40 | device=pitch.device)[None]
41 |
42 | # Zero-center
43 | return word_spreads - utterance_spread
44 |
45 |
46 | ###############################################################################
47 | # Utilities
48 | ###############################################################################
49 |
50 |
51 | def spread(pitch):
52 | """Compute pitch spread"""
53 | return torch.quantile(pitch, .95) - torch.quantile(pitch, .05)
54 |
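A minimal usage sketch (paths are hypothetical; emphases.load.audio is the loader used in emphases/data/dataset.py):

import pypar

import emphases

# Load a forced alignment and the corresponding audio (hypothetical paths)
alignment = pypar.Alignment('example.TextGrid')
audio = emphases.load.audio('example.wav')

# Per-word emphasis scores; shape is (1, number of words)
scores = emphases.baselines.pitch_variance.infer(
    alignment,
    audio,
    emphases.SAMPLE_RATE,
    gpu=None)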
--------------------------------------------------------------------------------
/emphases/baselines/prominence/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 | from . import cwt_utils
3 | from . import duration_processing
4 | from . import energy_processing
5 | from . import f0_processing
6 | from . import filter
7 | from . import loma
8 | from . import pitch_tracker
9 | from . import smooth_and_interp
10 |
--------------------------------------------------------------------------------
/emphases/baselines/prominence/core.py:
--------------------------------------------------------------------------------
1 | import fractions
2 |
3 | import torch
4 | import numpy as np
5 | from scipy.signal import resample_poly
6 |
7 | import emphases
8 |
9 |
10 | ###############################################################################
11 | # Prominence API
12 | ###############################################################################
13 |
14 |
15 | def infer(alignment, audio, sample_rate):
16 | """Compute per-word prominence from alignment and audio"""
17 | # Convert to numpy
18 | audio = audio.numpy()[0]
19 |
20 | # Compute energy
21 | energy = emphases.baselines.prominence.energy_processing.extract_energy(
22 | audio,
23 | sample_rate)
24 | energy = np.cbrt(energy + 1)
25 |
26 | # Smooth energy
27 | energy = emphases.baselines.prominence.smooth_and_interp.peak_smooth(
28 | energy,
29 | 30,
30 | 3)
31 | energy = emphases.baselines.prominence.smooth_and_interp.smooth(energy, 10)
32 |
33 | # Compute pitch
34 | pitch = emphases.baselines.prominence.pitch_tracker.inst_freq_pitch(
35 | audio,
36 | sample_rate)
37 | pitch = emphases.baselines.prominence.f0_processing.process(pitch)
38 |
39 | # Extract duration
40 | duration = \
41 | emphases.baselines.prominence.duration_processing.get_duration_signal(
42 | alignment,
43 | weights=[.5, .5],
44 | rate=200)
45 |
46 | # Slice features
47 | min_length = np.min([len(pitch), len(energy), len(duration)])
48 | pitch = pitch[:min_length]
49 | energy = energy[:min_length]
50 | duration = duration[:min_length]
51 |
52 | # Combine features
53 | combined = (
54 | emphases.PROMINENCE_PITCH_WEIGHT * normalize(pitch) +
55 | emphases.PROMINENCE_ENERGY_WEIGHT * normalize(energy) +
56 | emphases.PROMINENCE_DURATION_WEIGHT * normalize(duration))
57 | combined = normalize(
58 | emphases.baselines.prominence.smooth_and_interp.remove_bias(
59 | combined,
60 | 800))
61 |
62 | # Distance between adjacent scales (.25 means 4 scales per octave)
63 | scale_distance = .25 # octaves
64 |
65 | # Continuous wavelet transform analysis
66 | cwt, scales, freqs = emphases.baselines.prominence.cwt_utils.cwt_analysis(
67 | combined,
68 | mother_name='mexican_hat',
69 | period=3,
70 | num_scales=34,
71 | scale_distance=scale_distance,
72 | apply_coi=False)
73 | cwt = np.real(cwt)
74 | scales *= 200
75 |
76 | # Get scale that minimizes distance with average word length
77 |     average_duration = (alignment.end() / len(alignment)) * 200
78 | scales = 1. / freqs * 200 * .5
79 | scale = np.argmin(np.abs(scales - average_duration))
80 |
81 | # Define the scale information
82 | pos_loma_start = scale + \
83 | int(emphases.LOMA_PROMINENCE_START / scale_distance)
84 | pos_loma_end = scale + \
85 | int(emphases.LOMA_PROMINENCE_END / scale_distance)
86 | neg_loma_start = scale + \
87 | int(emphases.LOMA_BOUNDARY_START / scale_distance)
88 | neg_loma_end = scale + \
89 | int(emphases.LOMA_BOUNDARY_END / scale_distance)
90 |
91 | # Retrieve line of maximum amplitude
92 | pos_loma = emphases.baselines.prominence.loma.get_loma(
93 | cwt,
94 | scales,
95 | pos_loma_start,
96 | pos_loma_end)
97 | neg_loma = emphases.baselines.prominence.loma.get_loma(
98 | -cwt,
99 | scales,
100 | neg_loma_start,
101 | neg_loma_end)
102 |
103 | # Decode prominence
104 | max_loma = np.array(emphases.baselines.prominence.loma.get_prominences(
105 | pos_loma,
106 | alignment,
107 | rate=200))
108 |
109 | # Prominence dimensions - [time, value]
110 | prominences = torch.tensor(max_loma)
111 |
112 | # Decode boundaries
113 |     # Boundaries dimensions - [time, value]
114 | boundaries = torch.tensor(emphases.baselines.prominence.loma.get_boundaries(
115 | max_loma,
116 | neg_loma,
117 | alignment))
118 |
119 | return prominences[:, 1][None]
120 |
121 |
122 | ###############################################################################
123 | # Utilities
124 | ###############################################################################
125 |
126 |
127 | def normalize(features):
128 | """Normalize features"""
129 | return (features - np.nanmean(features)) / (np.nanstd(features) + 1e-7)
130 |
131 |
132 | def resample(signal, original_sample_rate, target_sample_rate):
133 | """Resample signal"""
134 | ratio = fractions.Fraction(target_sample_rate, original_sample_rate)
135 | return resample_poly(signal, ratio.numerator, ratio.denominator)
136 |
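The prominence baseline follows the same calling convention (hypothetical paths again):

import pypar

import emphases

alignment = pypar.Alignment('example.TextGrid')
audio = emphases.load.audio('example.wav')

# Per-word prominence values; shape is (1, number of words)
scores = emphases.baselines.prominence.infer(alignment, audio, emphases.SAMPLE_RATE)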
--------------------------------------------------------------------------------
/emphases/baselines/prominence/cwt_utils.py:
--------------------------------------------------------------------------------
1 | from numpy import array, sqrt, pad, mean, pi
2 |
3 | import pycwt as cwt
4 |
5 |
6 | ###############################################################################
7 | # Private routines
8 | ###############################################################################
9 |
10 |
11 | def _padded_cwt(params, dt, dj, s0, J, mother, padding_len):
12 | """Private function to compute a wavelet transform on padded data
13 |
14 | Parameters
15 | ----------
16 | params: arraylike
17 | The prosodic parameters.
18 |     dt: float
19 |         The sampling interval (1 / frame rate)
20 |     dj: float
21 |         The distance between scales in octaves
22 |     s0: float
23 |         The first (smallest) scale
24 |     J: int
25 |         The number of scales
26 |     mother: wavelet object
27 | The mother wavelet.
28 | padding_len: int
29 | The padding length
30 |
31 | Returns
32 | -------
33 | wavelet_matrix: ndarray
34 | The wavelet data resulting from the analysis
35 | scales: arraylike
36 | The scale indices corresponding to the wavelet data
37 |     freqs: arraylike
38 |         The frequencies corresponding to the scales
39 |     coi: array
40 |         The cone of influence values
41 |     fft: arraylike
42 |         The FFT of the padded signal
43 |     fftfreqs: arraylike
44 |         The frequencies corresponding to the FFT bins
45 | """
46 | padded = pad(params, padding_len, mode='edge')
47 | wavelet_matrix, scales, freqs, coi, fft, fftfreqs = cwt.cwt(
48 | padded,
49 | dt,
50 | dj,
51 | s0,
52 | J,
53 | mother)
54 | wavelet_matrix = \
55 | wavelet_matrix[:, padding_len:len(wavelet_matrix[0]) - padding_len]
56 | return wavelet_matrix, scales, freqs, coi, fft, fftfreqs
57 |
58 |
59 | def _zero_outside_coi(wavelet_matrix, freqs, rate=200):
60 |     """Private function to set each element outside of the Cone Of Influence (coi) to 0.
61 |
62 |     Parameters
63 |     ----------
64 |     wavelet_matrix: ndarray
65 |         The wavelet data resulting from the analysis
66 |     freqs: arraylike
67 |         The frequencies corresponding to the scales
68 | """
69 | for i in range(0, wavelet_matrix.shape[0]):
70 | coi = int(1. / freqs[i] * rate)
71 | wavelet_matrix[i, :coi] = 0.
72 | wavelet_matrix[i, -coi:] = 0.
73 | return wavelet_matrix
74 |
75 |
76 | def _scale_for_reconstruction(
77 | wavelet_matrix,
78 | scales,
79 | dj,
80 | dt,
81 | mother='mexican_hat',
82 | period=3):
83 |     """Scale the wavelet matrix so that the original signal can be reconstructed
84 |
85 | Parameters
86 | ----------
87 | wavelet_matrix: ndarray
88 | The wavelet data resulting from the analysis
89 | scales: arraylike
90 | The scale indices corresponding to the wavelet data
91 |     dj: float
92 |         The distance between scales in octaves
93 |     dt: float
94 |         The sampling interval (1 / frame rate)
95 |     mother: string
96 |         The name of the mother wavelet
97 |     period: int
98 |         The period of the mother wavelet
99 | """
100 | scaled = array(wavelet_matrix)
101 |
102 | # mexican Hat
103 | c = dj / (3.541 * .867)
104 |
105 | if mother == 'morlet':
106 | cc = 1.83
107 |         # periods 5 and 6 are correct; 3 and 4 are approximate
108 | if period == 3:
109 | cc = 1.74
110 | if period == 4:
111 | cc = 1.1
112 | elif period == 5:
113 | cc = .9484
114 | elif period == 6:
115 | cc = .7784
116 | c = dj / (cc * pi ** (-.25))
117 |
118 | for i in range(0, len(scales)):
119 | scaled[i] *= c * sqrt(dt) / sqrt(scales[i])
120 |         # subtracting the mean should not be necessary?
121 | scaled[i] -= mean(scaled[i])
122 |
123 | return scaled
124 |
125 |
126 | def cwt_analysis(
127 | params,
128 | mother_name='mexican_hat',
129 | num_scales=12,
130 | first_scale=None,
131 | scale_distance=1.,
132 | apply_coi=True,
133 | period=5,
134 | frame_rate=200):
135 |     """Perform the continuous wavelet analysis of the given parameters
136 |
137 | Parameters
138 | ----------
139 | params: arraylike
140 | The parameters to analyze.
141 | mother_name: string, optional
142 | The name of the mother wavelet [default: mexican_hat].
143 | num_scales: int, optional
144 | The number of scales [default: 12].
145 | first_scale: int, optional
146 | The width of the shortest scale
147 | scale_distance: float, optional
148 | The distance between scales [default: 1.0].
149 | apply_coi: boolean, optional
150 | Apply the Cone Of Influence (coi)
151 | period: int, optional
152 | The period of the mother wavelet [default: 5].
153 | frame_rate: int, optional
154 | The signal frame rate [default: 200].
155 |
156 | Returns
157 | -------
158 | wavelet_matrix: ndarray
159 | The wavelet data resulting from the analysis
160 | scales: arraylike
161 | The scale indices corresponding to the wavelet data
162 | """
163 | # setup wavelet transform
164 | dt = 1. / float(frame_rate) # frame length
165 |
166 | if not first_scale:
167 | first_scale = dt # first scale, here frame length
168 |
169 | dj = scale_distance # distance between scales in octaves
170 | J = num_scales # number of scales
171 |
172 | mother = cwt.MexicanHat()
173 |
174 | if str.lower(mother_name) == 'morlet':
175 | mother = cwt.Morlet(period)
176 |
177 | wavelet_matrix, scales, freqs, *_ = _padded_cwt(
178 | params,
179 | dt,
180 | dj,
181 | first_scale,
182 | J,
183 | mother,
184 | 400)
185 | wavelet_matrix = _scale_for_reconstruction(
186 | wavelet_matrix,
187 | scales,
188 | dj,
189 | dt,
190 | mother=mother_name,
191 | period=period)
192 |
193 | if apply_coi:
194 | wavelet_matrix = _zero_outside_coi(wavelet_matrix, freqs, frame_rate)
195 |
196 | return wavelet_matrix, scales, freqs
197 |
--------------------------------------------------------------------------------
/emphases/baselines/prominence/duration_processing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Constants
8 | ###############################################################################
9 |
10 |
11 | SILENCE_SYMBOLS = [
12 | '#',
13 | '!pau',
14 | 'sp',
15 | '',
16 | 'pau',
17 | '!sil',
18 | 'sil',
19 | '',
20 | ' ',
21 | '',
22 | '',
23 | '.',
24 | ',',
25 | '?',
26 | '']
27 |
28 |
29 | ###############################################################################
30 | # Duration
31 | ###############################################################################
32 |
33 |
34 | def _get_dur_stats(labels, rate=200):
35 | durations = []
36 | for i in range(len(labels)):
37 | st, en, unit = labels[i]
38 | st *= rate
39 | en *= rate
40 | if unit.lower() not in SILENCE_SYMBOLS:
41 | dur = en - st
42 | dur = np.log(dur + 1.)
43 | durations.append(dur)
44 | durations = np.array(durations)
45 | return np.min(durations), np.max(durations), np.mean(durations)
46 |
47 |
48 | def get_rate(params, hp=10, lp=150):
49 | """
50 |     Estimate speech rate as the center of gravity of the wavelet spectrum,
51 |     similar to the method described in "Boundary Detection using Continuous Wavelet Analysis" (2016)
52 | """
53 | params = emphases.baselines.prominence.smooth_and_interp.smooth(params, hp)
54 | params -= emphases.baselines.prominence.smooth_and_interp.smooth(params, lp)
55 |
56 | wavelet_matrix, *_ = emphases.baselines.prominence.cwt_utils.cwt_analysis(
57 | params,
58 | mother_name='Morlet',
59 | num_scales=80,
60 | scale_distance=.1,
61 | apply_coi=True,
62 | period=2)
63 | wavelet_matrix = abs(wavelet_matrix)
64 |
65 | rate = np.zeros(len(params))
66 |
67 |     for i in range(0, wavelet_matrix.shape[1]):
68 | frame_en = np.sum(wavelet_matrix[:, i])
69 | # center of gravity
70 | rate[i] = np.nonzero(
71 | wavelet_matrix[:, i].cumsum() >= frame_en * .5)[0].min()
72 |
73 | return emphases.baselines.prominence.smooth_and_interp.smooth(rate, 30)
74 |
75 |
76 | def duration(labels, rate=200):
77 | """Construct duration signal from labels"""
78 | dur = np.zeros(len(labels))
79 | params = np.zeros(int(labels[-1][1] * rate))
80 | prev_end = 0
81 | min_dur, *_ = _get_dur_stats(labels, rate=200)
82 | for i in range(0, len(labels)):
83 | st, en, unit = labels[i]
84 | st *= rate
85 | en *= rate
86 | dur[i] = en - st
87 | dur[i] = np.log(dur[i] + 1.)
88 |
89 | if unit.lower() in SILENCE_SYMBOLS:
90 | dur[i] = min_dur
91 |
92 | # skip very short units, likely labelling errors
93 | if en <= st + .01:
94 | continue
95 |
96 | # unit duration -> height of the duration contour in the middle of the unit
97 | index = min(len(params) - 1, int(st + (en - st) / 2.))
98 | params[index] = dur[i]
99 |
100 | # Handle gaps in labels similarly to silences
101 | if st > prev_end and i > 1:
102 | params[int(prev_end + (st - prev_end) / 2.)] = min_dur
103 | prev_end = en
104 |
105 | # set endpoints to mean in order to avoid large "valleys"
106 | params[0] = np.mean(dur)
107 | params[-1] = np.mean(dur)
108 |
109 |     # make continuous duration contour and smooth a bit
110 | params = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(params, 'pchip')
111 | return emphases.baselines.prominence.smooth_and_interp.smooth(params, 20)
112 |
113 |
114 | def get_duration_signal(
115 | alignment,
116 | weights=[],
117 | rate=1):
118 | """
119 |     Construct a duration contour from labels. If multiple tiers are selected,
120 |     construct a contour for each tier and return their weighted sum
121 | """
122 | word_tier = [(word.start(), word.end(), str(word)) for word in alignment]
123 | phoneme_tier = [
124 | (phoneme.start(), phoneme.end(), str(phoneme))
125 | for phoneme in alignment.phonemes()]
126 | tiers = [phoneme_tier, word_tier]
127 |
128 | durations = []
129 |
130 | for tier in tiers:
131 | durations.append(
132 | emphases.baselines.prominence.normalize(
133 | duration(tier, rate=rate)))
134 | durations = match_length(durations)
135 | sum_durations = np.zeros(len(durations[0]))
136 | if len(weights) != len(tiers):
137 | weights = np.ones(len(tiers))
138 | for i in range(len(durations)):
139 | sum_durations += durations[i] * weights[i]
140 | return sum_durations
141 |
142 |
143 | def match_length(sig_list):
144 |     """Reduce the length of all signals to the minimum one.
145 |
146 | Parameters
147 | ----------
148 | sig_list: list
149 | List of signals which are 1D array of samples.
150 | """
151 | length = min(map(len, sig_list))
152 | for i in range(0, len(sig_list)):
153 | sig_list[i] = sig_list[i][:int(length)]
154 | return sig_list
155 |
--------------------------------------------------------------------------------
/emphases/baselines/prominence/energy_processing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import emphases
4 |
5 |
6 | def extract_energy(
7 | waveform,
8 | sample_rate=16000,
9 | min_freq=emphases.PROMINENCE_ENERGY_MIN,
10 | max_freq=emphases.PROMINENCE_ENERGY_MAX,
11 | frame_rate=200):
12 |     # Bandpass filter the waveform
13 | lp_waveform = emphases.baselines.prominence.filter.butter_bandpass_filter(
14 | waveform,
15 | min_freq,
16 | max_freq,
17 | sample_rate,
18 | order=5)
19 |
20 | # Compute energy
21 | energy = np.sqrt(lp_waveform ** 2)
22 |
23 | # Resample to frame rate
24 | return emphases.baselines.prominence.resample(energy, sample_rate, frame_rate)
25 |
--------------------------------------------------------------------------------
/emphases/baselines/prominence/f0_processing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import emphases
4 |
5 |
6 | def rolling_window(a, window):
7 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
8 | strides = a.strides + (a.strides[-1],)
9 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
10 |
11 |
12 | def _cut_boundary_vals(params, num_vals):
13 | cutted = np.array(params)
14 | for i in range(num_vals, len(params) - num_vals):
15 | if params[i] <= 0 and params[i + 1] > 0:
16 | for j in range(i, i + num_vals):
17 | cutted[j] = 0.
18 |
19 | if params[i] > 0 and params[i + 1] <= 0:
20 | for j in range(i - num_vals, i + 1):
21 | cutted[j] = 0.
22 |
23 | return cutted
24 |
25 |
26 | def _remove_outliers(log_pitch):
27 | fixed = np.array(log_pitch)
28 |
29 | # Remove outlier f0 values from voicing boundaries
30 | boundary_cut = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(
31 | _cut_boundary_vals(fixed, 3),
32 | 'linear')
33 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(fixed, 'linear')
34 | fixed[abs(interp - boundary_cut) > .1] = 0
35 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(fixed, 'linear')
36 |
37 | # iterative outlier removal
38 |     # 1. compare current contour estimate to a smoothed contour and remove deviations larger than threshold
39 |     # 2. smooth current estimate with shorter window, tighten threshold
40 | # 3. goto 1.
41 |
42 | # In practice, first handles large scale octave jump type errors,
43 | # finally small scale 'errors' like consonant perturbation effects and
44 | # other irregularities in voicing boundaries
45 | #
46 | # if this appears to remove too many correct values, increase thresholds
47 | num_iter = 30
48 | max_win_len = 100
49 | min_win_len = 10
50 | max_threshold = 3. # threshold with broad window
51 |     min_threshold = .5 # threshold with shorter window
52 |
53 | _std = np.std(interp)
54 | # do not tie fixing to liveliness of the original
55 | _std = .3
56 |
57 | win_len = np.exp(
58 | np.linspace(np.log(max_win_len), np.log(min_win_len), num_iter + 1))
59 | outlier_threshold = np.linspace(
60 | _std * max_threshold,
61 | _std * min_threshold,
62 | num_iter + 1)
63 | for i in range(0, num_iter):
64 | smooth_contour = emphases.baselines.prominence.smooth_and_interp.smooth(interp, win_len[i])
65 | low_limit = smooth_contour - outlier_threshold[i]
66 | # bit more careful upwards, not to cut emphases
67 | hi_limit = smooth_contour + outlier_threshold[i] * 1.5
68 |
69 | # octave jump down fix, more harm than good?
70 | fixed[interp > hi_limit] = 0
71 | fixed[interp < low_limit] = 0
72 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(fixed, 'linear')
73 |
74 | return fixed
75 |
76 |
77 | def _interpolate(f0):
78 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(f0)
79 | _std = np.std(interp)
80 | _min = np.min(interp)
81 | low_limit = emphases.baselines.prominence.smooth_and_interp.smooth(interp, 200) - 1.5 * _std
82 | low_limit[low_limit < _min] = _min
83 | hi_limit = emphases.baselines.prominence.smooth_and_interp.smooth(interp, 100) + 2. * _std
84 | voicing = np.array(f0)
85 | constrained = np.array(f0)
86 | constrained = np.maximum(f0, low_limit)
87 | constrained = np.minimum(constrained, hi_limit)
88 | interp = emphases.baselines.prominence.smooth_and_interp.peak_smooth(
89 | constrained,
90 | 100,
91 | 20,
92 | voicing=voicing)
93 | # smooth voiced parts a bit too
94 | return emphases.baselines.prominence.smooth_and_interp.peak_smooth(interp, 3, 2)
95 |
96 |
97 | def process(f0):
98 | log_pitch = np.array(f0)
99 | log_scaled = True
100 | if np.mean(f0[f0 > 0]) > 20:
101 | log_scaled = False
102 | log_pitch[f0 > 0] = np.log(f0[f0 > 0])
103 | log_pitch[f0 <= 0] = 0
104 |
105 | log_pitch = _remove_outliers(log_pitch)
106 | log_pitch = _interpolate(log_pitch)
107 |
108 | if not log_scaled:
109 | return np.exp(log_pitch)
110 | else:
111 | return log_pitch
112 |
--------------------------------------------------------------------------------
/emphases/baselines/prominence/filter.py:
--------------------------------------------------------------------------------
1 | from scipy.signal import butter, lfilter
2 |
3 |
4 | def butter_bandpass(lowcut, highcut, fs, order=5):
5 | """Generate the butter bandpass filter
6 |
7 | For more details see scipy.signal.butter documentation
8 |
9 | Parameters
10 | ----------
11 | lowcut: int
12 | The low cut value
13 |     highcut: int
14 |         The high cut value
15 |     fs: int
16 |         Signal sample rate
17 |     order: int
18 |         Order of the Butterworth filter
19 |
20 | Returns
21 | -------
22 | b: arraylike
23 | Numerator polynomial of the IIR filter
24 | a: arraylike
25 | Denominator polynomial of the IIR filter
26 | """
27 | nyq = .5 * fs
28 | low = lowcut / nyq
29 | if highcut >= nyq * .95:
30 | highcut = nyq * .95
31 | high = highcut / nyq
32 | b, a = butter(order, [low, high], btype='band')
33 | return b, a
34 |
35 |
36 | def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
37 | """Filter signal data using a butter filter type
38 |
39 | For more details see scipy.signal.butter and scipy.signal.lfilter documentation
40 |
41 | Parameters
42 | ----------
43 | data: arraylike
44 | An N-dimensional input array.
45 | lowcut: int
46 | The lowcut filtering value.
47 |     highcut: int
48 | The highcut filtering value.
49 | fs: int
50 | The signal sample rate.
51 | order: int
52 | The order of the butter filter.
53 |
54 | Returns
55 | -------
56 | arraylike
57 | An N-dimensional filtered array
58 | """
59 | b, a = butter_bandpass(lowcut, highcut, fs, order=order)
60 | return lfilter(b, a, data)
61 |
--------------------------------------------------------------------------------
/emphases/baselines/prominence/loma.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from operator import itemgetter
3 |
4 |
5 | def simplify(loma):
6 |     """Simplify each line of maximum amplitude to a (position, strength) pair
7 |     Parameters
8 |     ----------
9 |     loma: list
10 |         The lines of maximum amplitude
11 | """
12 | simplified = []
13 | for l in loma:
14 |         # align loma to its position in the middle of the line
15 | pos = l[int(len(l) / 2.)][0]
16 | strength = l[-1][1]
17 | simplified.append((pos, strength))
18 | return simplified
19 |
20 |
21 | def get_prominences(pos_loma, alignment, rate=1):
22 |     """Get the maximum prominence line within each word
23 |     Parameters
24 |     ----------
25 |     pos_loma: list
26 |         Positive loma values
27 |     alignment: pypar.Alignment
28 |         The forced alignment containing word start and end times
29 | """
30 | max_word_loma = []
31 | loma = simplify(pos_loma)
32 | for st, end in [(word.start(), word.end()) for word in alignment]:
33 | st *= rate
34 | end *= rate
35 | word_loma = []
36 | for l in loma:
37 | if l[0] >= st and l[0] <= end:
38 | word_loma.append(l)
39 | if len(word_loma) > 0:
40 | max_word_loma.append(sorted(word_loma, key=itemgetter(1))[-1])
41 | else:
42 | max_word_loma.append([st + (end - st) / 2., 0.])
43 | return max_word_loma
44 |
45 |
46 | def get_boundaries(max_word_loma, boundary_loma, alignment):
47 | """get strongest lines of minimum amplitude between adjacent words' max lines"""
48 | boundary_loma = simplify(boundary_loma)
49 | max_boundary_loma = []
50 | st = 0
51 | end = 0
52 | for i in range(1, len(max_word_loma)):
53 | w_boundary_loma = []
54 | for l in boundary_loma:
55 | st = max_word_loma[i - 1][0]
56 | end = max_word_loma[i][0]
57 | if l[0] >= st and l[0] < end:
58 | if l[1] > 0:
59 | w_boundary_loma.append(l)
60 |
61 | if len(w_boundary_loma) > 0:
62 | max_boundary_loma.append(
63 | sorted(w_boundary_loma, key=itemgetter(1))[-1])
64 | else:
65 | max_boundary_loma.append([st + (end - st) / 2, 0])
66 |
67 | # final boundary is not estimated
68 | max_boundary_loma.append((alignment.end(), 1))
69 |
70 | return max_boundary_loma
71 |
72 |
73 | def _get_parent(child_index, parent_diff, parent_indices):
74 | """Private function to find the parent of the given child peak. At child peak index, follow the
75 | slope of parent scale upwards to find parent
76 |
77 | Parameters
78 | ----------
79 | child_index: int
80 | Index of the current child peak
81 |     parent_diff: arraylike
82 |         First difference of the parent-scale wavelet values
83 |     parent_indices: list of int
84 |         Indices of available parents
85 |
86 |     Returns
87 |     -------
88 | int
89 | The parent index or None if there is no parent
90 | """
91 | for i in range(0, len(parent_indices)):
92 | if parent_indices[i] > child_index:
93 | if parent_diff[int(child_index)] > 0:
94 | return parent_indices[i]
95 | else:
96 | if i > 0:
97 | return parent_indices[i - 1]
98 | else:
99 | return parent_indices[0]
100 |
101 | if len(parent_indices) > 0:
102 | return parent_indices[-1]
103 |
104 |
105 | def get_loma(wavelet_matrix, scales, min_scale, max_scale):
106 | """Get the Line Of Maximum Amplitude (loma)
107 |
108 | Parameters
109 | ----------
110 | wavelet_matrix: matrix of float
111 | The wavelet matrix
112 | scales: list of int
113 | The list of scales
114 | min_scale: int
115 | The minimum scale
116 | max_scale: int
117 | The maximum scale
118 |
119 | Returns
120 | -------
121 | list of tuples
122 | ?
123 |
124 | Note
125 | ----
126 | change this so that one level is done in one chunk, not one parent.
127 | """
128 |     min_peak = -10000. # minimum peak amplitude to consider. NOTE: this has no meaning unless scales are normalized
129 | max_dist = 10 # how far in time to look for parent peaks. NOTE: frame rate and scale dependent
130 |
131 | # get peaks from the first scale
132 | peaks, indices = get_peaks(wavelet_matrix[min_scale], min_peak)
133 |
134 | loma = dict()
135 | root = dict()
136 | for i in range(0, len(peaks)):
137 | loma[indices[i]] = []
138 |
139 | # keep track of roots of each loma
140 | root[indices[i]] = indices[i]
141 |
142 | for i in range(min_scale + 1, max_scale):
143 | max_dist = np.sqrt(scales[i]) * 4
144 |
145 | # find peaks in the parent scale
146 | p_peaks, p_indices = get_peaks(wavelet_matrix[i], min_peak)
147 | parents = dict(zip(p_indices, p_peaks))
148 |
149 | # find a parent for each child peak
150 | children = dict()
151 | for p in p_indices:
152 | children[p] = []
153 |
154 | parent_diff = np.diff(wavelet_matrix[i], 1)
155 | for j in range(0, len(indices)):
156 |             parent = _get_parent(indices[j], parent_diff, p_indices)
157 | if parent:
158 | if abs(parent - indices[j]) < max_dist and peaks[j] > min_peak:
159 | children[parent].append([indices[j], peaks[j]])
160 |
161 | # for each parent, select max child
162 | peaks = []
163 | indices = []
164 | for p in children:
165 | if len(children[p]) > 0:
166 | maxi = sorted(children[p], key=itemgetter(1))[-1]
167 | indices.append(p)
168 | peaks.append(maxi[1] + parents[p])
169 |
170 |                 # append child to correct loma
171 | loma[root[maxi[0]]].append([maxi[0], maxi[1] + parents[p], i, p])
172 | root[p] = root[maxi[0]]
173 |
174 | sorted_loma = []
175 | for k in sorted(loma.keys()):
176 | if len(loma[k]) > 0:
177 | sorted_loma.append(loma[k])
178 |
179 | return sorted_loma
180 |
181 |
182 | def get_peaks(params, threshold=-10):
183 | """Find the peaks based on the given prosodic parameters.
184 |
185 | Parameters
186 | ----------
187 |     params: arraylike
188 |         Prosodic parameters
189 |     threshold: int
190 |         Minimum peak amplitude to consider
191 |
192 | Returns
193 | -------
194 | peaks: arraylike
195 | array of peak values and peak indices
196 | """
197 | indices = (np.diff(np.sign(np.diff(params))) < 0).nonzero()[0] + 1
198 | peaks = params[indices]
199 | return np.array([peaks[peaks > threshold], indices[peaks > threshold]])
200 |
--------------------------------------------------------------------------------
/emphases/baselines/prominence/pitch_tracker.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.ndimage
3 | import scipy.signal
4 |
5 | import emphases
6 |
7 |
8 | def _get_f0(spec, energy, min_hz, max_hz, thresh, sil_thresh):
9 | """
10 |     Return the frequency bin with maximum energy if it is over the given threshold
11 |     and the overall energy of the frame is over the silence threshold;
12 |     otherwise return 0 (unvoiced)
13 | """
14 | cand = int(min_hz) + np.argmax(spec[int(min_hz):int(max_hz)])
15 | if spec[cand] > thresh and energy > sil_thresh:
16 | if cand > 2 * min_hz and spec[int(round(cand / 2.))] > spec[cand] * .5:
17 | return int(round(cand / 2.))
18 | else:
19 | return cand
20 | return 0
21 |
22 |
23 | def _track_pitch(
24 | pic,
25 | min_hz=50,
26 | max_hz=450,
27 | thresh=.1,
28 | energy_thresh=1.):
29 | """
30 |     Extract the pitch contour from a time-frequency image.
31 |     The bin with maximum energy per frame is chosen as a first f0 estimate,
32 |     followed by refinement steps based on the assumption of continuity of the pitch track
33 | """
34 | pitch = np.zeros(pic.shape[0])
35 |
36 | # calc energy threshold for voicing
37 | log_energy = np.log(np.sum(pic, axis=1))
38 | energy_thresh = \
39 | np.min(emphases.baselines.prominence.smooth_and_interp.smooth(log_energy, 20)) + energy_thresh
40 | pic_smooth = pic * scipy.ndimage.gaussian_filter(pic, [2, 5])
41 |
42 | # find frequency bins with max_energy
43 | for i in range(0, pic_smooth.shape[0]):
44 | pitch[i] = _get_f0(
45 | pic_smooth[i],
46 | log_energy[i],
47 | min_hz,
48 | max_hz,
49 | thresh,
50 | energy_thresh)
51 |
52 | # second pass with soft constraints
53 | n_iters = 3
54 |     from scipy.signal.windows import gaussian
55 |
56 | for iter in range(0, n_iters):
57 | smoothed = emphases.baselines.prominence.f0_processing.process(pitch)
58 | smoothed = emphases.baselines.prominence.smooth_and_interp.smooth(smoothed, int(200. / (iter + 1.)))
59 |
60 |         # gradually tightening gaussian window centered on current estimate to softly constrain next iteration
61 | win_len = 800
62 | g_window = gaussian(win_len, int(np.mean(smoothed) * (1. / (iter + 1.) ** 2)))
63 |
64 | for i in range(0, pic.shape[0]):
65 | window = np.zeros(len(pic_smooth[i]))
66 | st = int(np.max((0, int(smoothed[i] - win_len))))
67 | end = int(np.min((int(smoothed[i] + win_len * .5), win_len - st)))
68 | window[st:end] = g_window[win_len - end:]
69 | pitch[i] = _get_f0(
70 | pic_smooth[i] * window, log_energy[i],
71 | min_hz,
72 | max_hz,
73 | thresh,
74 | energy_thresh)
75 |
76 | return pitch
77 |
78 |
79 | def _assign_to_bins(pic, freqs, mags):
80 | for i in range(1, freqs.shape[0] - 1):
81 | for j in range(0, freqs.shape[1]):
82 | try:
83 | pic[j, int(freqs[i, j])] += mags[i, j]
84 | except:
85 | pass
86 |
87 |
88 | def inst_freq_pitch(
89 | wav_form,
90 | fs,
91 | min_hz=emphases.FMIN,
92 | max_hz=emphases.FMAX,
93 | voicing_thresh=emphases.VOICED_THRESHOLD,
94 | target_rate=200):
95 | """Extract speech f0 using the continuous wavelet transform"""
96 | voicing_thresh = (voicing_thresh - 50.) / 100.
97 | sample_rate = 4000
98 | tmp_wav_form = emphases.baselines.prominence.resample(wav_form, fs, sample_rate)
99 | tmp_wav_form = emphases.baselines.prominence.normalize(tmp_wav_form)
100 |
101 | DEC = int(round(sample_rate / target_rate))
102 |
103 | pic = np.zeros(
104 | shape=(int(len(tmp_wav_form) / float(DEC)), int(sample_rate / 4.)))
105 |
106 |     # use continuous wavelet transform to get instantaneous frequencies
107 | # integrate analyses with morlet mother wavelets with period = 5 for
108 | # good time and frequency resolution
109 | # setup wavelet
110 | s0 = 2. / sample_rate
111 | dj = .05 # 20 scales per octave
112 | J = 120 # six octaves
113 | dt = 1. / sample_rate
114 | periods = [5]
115 | for p in periods:
116 | wavelet_matrix, *_ = emphases.baselines.prominence.cwt_utils.cwt_analysis(
117 | tmp_wav_form,
118 | mother_name='morlet',
119 | first_scale=s0,
120 | num_scales=J,
121 | scale_distance=dj,
122 | apply_coi=False,
123 | period=p,
124 | frame_rate=sample_rate)
125 |
126 | # hilbert transform
127 | phase = np.unwrap(np.angle(wavelet_matrix), axis=1)
128 | freqs = np.abs((np.gradient(phase, dt)[1]) / (2. * np.pi))
129 |
130 | freqs = scipy.signal.decimate(freqs, DEC, zero_phase=True)
131 | mags = scipy.signal.decimate(abs(wavelet_matrix), DEC, zero_phase=True)
132 |
133 | # normalize magnitudes
134 | mags = (mags - mags.min()) / mags.ptp()
135 |
136 | # construct time-frequency image
137 | _assign_to_bins(pic, freqs, mags)
138 |
139 | # perform frequency domain autocorrelation to enhance f0
140 |     pic = scipy.ndimage.gaussian_filter(pic, [1, 1])
141 | length = np.min((max_hz * 3, pic.shape[1])).astype(int)
142 |
143 | for i in range(0, pic.shape[0]):
144 | acorr1 = np.correlate(pic[i, :length], pic[i, :length], mode='same')
145 | pic[i, :int(length / 2.)] *= acorr1[int(len(acorr1) / 2.):]
146 |
147 | return _track_pitch(pic, min_hz, max_hz, voicing_thresh)
148 |
--------------------------------------------------------------------------------
/emphases/baselines/prominence/smooth_and_interp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy import interpolate
3 |
4 |
5 | def remove_bias(params, win_len=300):
6 | return params - smooth(params, win_len)
7 |
8 |
9 | def interpolate_zeros(params, method='pchip', min_val=0):
10 | """
11 |     Interpolate values equal to min_val
12 |     :param params: 1D data vector
13 |     :param method: interpolation method ('pchip', 'spline', or an interp1d kind)
14 |     :param min_val: value treated as missing and interpolated over
15 |     :return: interpolated 1D vector
16 | """
17 | voiced = np.array(params, float)
18 | for i in range(0, len(voiced)):
19 | if voiced[i] == min_val:
20 | voiced[i] = np.nan
21 |
22 | if np.isnan(voiced[-1]):
23 | voiced[-1] = np.nanmin(voiced)
24 | if np.isnan(voiced[0]):
25 | voiced[0] = np.nanmean(voiced)
26 |
27 | not_nan = np.logical_not(np.isnan(voiced))
28 |
29 | indices = np.arange(len(voiced))
30 | if method == 'spline':
31 | interp = interpolate.UnivariateSpline(
32 | indices[not_nan],
33 | voiced[not_nan],
34 | k=2,
35 | s=0)
36 | # return voiced parts intact
37 | smoothed = interp(indices)
38 | for i in range(0, len(smoothed)):
39 | if not np.isnan(voiced[i]):
40 | smoothed[i] = params[i]
41 | return smoothed
42 |
43 | elif method == 'pchip':
44 | interp = interpolate.pchip(indices[not_nan], voiced[not_nan])
45 | else:
46 | interp = interpolate.interp1d(
47 | indices[not_nan],
48 | voiced[not_nan],
49 | method)
50 | return interp(indices)
51 |
52 |
53 | def smooth(params, win, type='HAMMING'):
54 |     """Gaussian-like smoothing via convolution with a Hamming (or rectangular) window"""
55 | win = int(win + .5)
56 | if win >= len(params) - 1:
57 | win = len(params) - 1
58 |
59 | if win % 2 == 0:
60 | win += 1
61 |
62 | s = np.r_[params[win - 1:0:-1], params, params[-1:-win:-1]]
63 |
64 | if type == 'HAMMING':
65 | w = np.hamming(win)
66 | else:
67 | w = np.ones(win)
68 |
69 | y = np.convolve(w / w.sum(), s, mode='valid')
70 | return y[int(win / 2):-int(win / 2)]
71 |
72 |
73 | def peak_smooth(params, max_iter, win, min_win=2, voicing=[]):
74 | """Iterative smoothing while preserving peaks, 'true envelope' -style"""
75 | smoothed = np.array(params)
76 | win_reduce = np.exp(np.linspace(np.log(win), np.log(min_win), max_iter))
77 |
78 | for i in range(0, max_iter):
79 |
80 | smoothed = np.maximum(params, smoothed)
81 |
82 | if len(voicing) > 0:
83 | smoothed = smooth(smoothed, int(win + .5))
84 | smoothed[voicing > 0] = params[voicing > 0]
85 | else:
86 | smoothed = smooth(smoothed, int(win + .5), type='rectangle')
87 |
88 | win = win_reduce[i]
89 |
90 | return smoothed
91 |
--------------------------------------------------------------------------------
/emphases/config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/config/__init__.py
--------------------------------------------------------------------------------
/emphases/config/defaults.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | import torch
5 | import GPUtil
6 |
7 |
8 | ###############################################################################
9 | # Metadata
10 | ###############################################################################
11 |
12 |
13 | # Configuration name
14 | CONFIG = 'emphases'
15 |
16 |
17 | ###############################################################################
18 | # Directories
19 | ###############################################################################
20 |
21 |
22 | # Location to save assets to be bundled with pip release
23 | ASSETS_DIR = Path(__file__).parent.parent / 'assets'
24 |
25 | # Location of preprocessed features
26 | CACHE_DIR = Path(__file__).parent.parent.parent / 'data' / 'cache'
27 |
28 | # Location of datasets on disk
29 | DATA_DIR = Path(__file__).parent.parent.parent / 'data' / 'datasets'
30 |
31 | # Location to save evaluation artifacts
32 | EVAL_DIR = Path(__file__).parent.parent.parent / 'eval'
33 |
34 | # Location to save training and adaptation artifacts
35 | RUNS_DIR = Path(__file__).parent.parent.parent / 'runs'
36 |
37 | # Location of compressed datasets on disk
38 | SOURCE_DIR = Path(__file__).parent.parent.parent / 'data' / 'sources'
39 |
40 |
41 | ###############################################################################
42 | # Audio parameters
43 | ###############################################################################
44 |
45 |
46 | # The maximum representable frequency
47 | FMAX = 550.
48 |
49 | # The minimum representable frequency
50 | FMIN = 40.
51 |
52 | # The number of samples between frames
53 | HOPSIZE = 160
54 |
55 | # Minimum decibel level
56 | MIN_DB = -100.
57 |
58 | # Number of linear frequency channels
59 | NUM_FFT = 1024
60 |
61 | # Number of mel channels
62 | NUM_MELS = 80
63 |
64 | # Voiced/unvoiced threshold for pitch estimation
65 | VOICED_THRESHOLD = .1625
66 |
67 | # Reference decibel level
68 | REF_DB = 20.
69 |
70 | # The audio sampling rate
71 | SAMPLE_RATE = 16000
72 |
73 | # The size of the audio analysis window
74 | WINDOW_SIZE = 1024
75 |
76 |
77 | ###############################################################################
78 | # Data parameters
79 | ###############################################################################
80 |
81 |
82 | # List of all datasets
83 | DATASETS = ['libritts']
84 |
85 | # Datasets to use for evaluation
86 | EVALUATION_DATASETS = ['libritts']
87 |
88 | # Whether to use mel features
89 | MEL_FEATURE = True
90 |
91 | # Whether to use loudness features
92 | LOUDNESS_FEATURE = False
93 |
94 | # Maximum number of allowed annotations
95 | MAX_ANNOTATIONS = None
96 |
97 | # Maximum number of training utterances
98 | MAX_TRAINING_UTTERANCES = None
99 |
100 | # Minimum number of allowed annotations
101 | MIN_ANNOTATIONS = None
102 |
103 | # Normalize input representations
104 | NORMALIZE = False
105 |
106 | # Whether to use the specified one-eighth dataset for scaling law experiments
107 | ONE_EIGHTH_UTTERANCES = False
108 |
109 | # Whether to use pitch features
110 | PITCH_FEATURE = False
111 |
112 | # Whether to use periodicity features
113 | PERIODICITY_FEATURE = False
114 |
115 | # Seed for all random number generators
116 | RANDOM_SEED = 0
117 |
118 | # Size of each partition. Must add to 1.
119 | SPLIT_SIZE_TEST = .1
120 | SPLIT_SIZE_TRAIN = .8
121 | SPLIT_SIZE_VALID = .1
122 |
123 | # Dataset to use for training
124 | TRAINING_DATASET = 'libritts'
125 |
126 | # Dataset to use for validation
127 | VALIDATION_DATASET = 'libritts'
128 |
129 |
130 | ###############################################################################
131 | # Evaluation parameters
132 | ###############################################################################
133 |
134 |
135 | # Number of steps between logging to Tensorboard
136 | LOG_INTERVAL = 100 # steps
137 |
138 | # Number of steps to perform for tensorboard logging
139 | LOG_STEPS = 32
140 |
141 | # Number of examples to plot to Tensorboard during training
142 | PLOT_EXAMPLES = 2
143 |
144 |
145 | ###############################################################################
146 | # Wavelet baseline parameters
147 | ###############################################################################
148 |
149 |
150 | # Line of maximum amplitude bounds
151 | LOMA_BOUNDARY_START = -2 # octaves
152 | LOMA_BOUNDARY_END = 1 # octaves
153 | LOMA_PROMINENCE_START = -3 # octaves
154 | LOMA_PROMINENCE_END = 0 # octaves
155 |
156 | # Weight applied to the duration
157 | PROMINENCE_DURATION_WEIGHT = .5
158 |
159 | # Maximum frequency in energy calculation
160 | PROMINENCE_ENERGY_MAX = 5000.
161 |
162 | # Minimum frequency in energy calculation
163 | PROMINENCE_ENERGY_MIN = 200.
164 |
165 | # Weight applied to the energy
166 | PROMINENCE_ENERGY_WEIGHT = 1.
167 |
168 | # Weight applied to the pitch
169 | PROMINENCE_PITCH_WEIGHT = 1.
170 |
171 | # Voiced/unvoiced threshold from 0 (all voiced) to 100 (all unvoiced)
172 | VOICED_THRESHOLD = 50
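# NOTE: this reassignment overrides the VOICED_THRESHOLD defined in the Audio
# parameters section of this file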
173 |
174 |
175 | ###############################################################################
176 | # Model parameters
177 | ###############################################################################
178 |
179 |
180 | # Activation function to use in convolution model
181 | ACTIVATION_FUNCTION = torch.nn.ReLU
182 |
183 | # Model architecture. One of ['convolution', 'transformer'].
184 | ARCHITECTURE = 'convolution'
185 |
186 | # Model width
187 | CHANNELS = 80
188 |
189 | # Decoder convolution kernel size
190 | DECODER_KERNEL_SIZE = 3
191 |
192 | # Dropout probability (or None to not use dropout)
193 | DROPOUT = None
194 |
195 | # Location to perform resampling from frame resolution to word resolution.
196 | # One of ['inference', 'input', 'intermediate', 'loss'].
197 | DOWNSAMPLE_LOCATION = 'intermediate'
198 |
199 | # Method to use for resampling from frame resolution to word resolution.
200 | # One of ['average', 'center', 'max', 'sum'].
201 | DOWNSAMPLE_METHOD = 'sum'
202 |
203 | # Encoder convolution kernel size
204 | ENCODER_KERNEL_SIZE = 3
205 |
206 | # Number of network layers
207 | LAYERS = 6
208 |
209 | # Method to use for inference. One of
210 | # ['neural', 'pitch-variance', 'duration-variance', 'prominence'].
211 | METHOD = 'neural'
212 |
213 | # Method to use for resampling from word resolution to frame resolution.
214 | # One of ['linear', 'nearest'].
215 | UPSAMPLE_METHOD = 'linear'
216 |
217 |
218 | ###############################################################################
219 | # Training parameters
220 | ###############################################################################
221 |
222 |
223 | # Number of buckets of data lengths used by the sampler
224 | BUCKETS = 2
225 |
226 | # Loss function. One of ['bce', 'mse']
227 | LOSS = 'bce'
228 |
229 | # Maximum number of frames in one batch
230 | MAX_TRAINING_FRAMES = 75000
231 |
232 | # Number of training steps
233 | NUM_STEPS = 6000
234 |
235 | # Number of data loading worker threads
236 | try:
237 | NUM_WORKERS = int(os.cpu_count() / max(1, len(GPUtil.getGPUs())))
238 | except ValueError:
239 | NUM_WORKERS = os.cpu_count()
240 |
--------------------------------------------------------------------------------
/emphases/config/static.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Files and directories
8 | ###############################################################################
9 |
10 |
11 | # Directory to save annotation artifacts
12 | ANNOTATION_DIR = emphases.SOURCE_DIR / 'crowdsource'
13 |
14 | # Default configuration file for emphasis annotation
15 | DEFAULT_ANNOTATION_CONFIG = emphases.ASSETS_DIR / 'configs' / 'annotate.yaml'
16 |
17 | # Location to save dataset partitions
18 | PARTITION_DIR = emphases.ASSETS_DIR / 'partitions'
19 |
20 |
21 | ###############################################################################
22 | # Audio parameters
23 | ###############################################################################
24 |
25 |
26 | # The hopsize in seconds
27 | HOPSIZE_SECONDS = emphases.HOPSIZE / emphases.SAMPLE_RATE
28 |
29 | # The maximum representable frequency in log-hz
30 | LOGFMAX = torch.log2(torch.tensor(emphases.FMAX))
31 |
32 | # The minimum representable frequency in log-hz
33 | LOGFMIN = torch.log2(torch.tensor(emphases.FMIN))
34 |
35 |
36 | ###############################################################################
37 | # Model parameters
38 | ###############################################################################
39 |
40 |
41 | # Number of input features to the model
42 | NUM_FEATURES = (
43 | emphases.MEL_FEATURE * emphases.NUM_MELS +
44 | int(emphases.PITCH_FEATURE) +
45 | int(emphases.PERIODICITY_FEATURE) +
46 | int(emphases.LOUDNESS_FEATURE))
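# With the default configuration (MEL_FEATURE=True, NUM_MELS=80, and the pitch,
# periodicity, and loudness features disabled), NUM_FEATURES == 80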
47 |
--------------------------------------------------------------------------------
/emphases/convert.py:
--------------------------------------------------------------------------------
1 | import emphases
2 |
3 |
4 | ###############################################################################
5 | # Time conversions
6 | ###############################################################################
7 |
8 |
9 | def frames_to_samples(frames):
10 | """Convert number of frames to samples"""
11 | return frames * emphases.HOPSIZE
12 |
13 |
14 | def frames_to_seconds(frames):
15 | """Convert number of frames to seconds"""
16 | return frames * emphases.HOPSIZE_SECONDS
17 |
18 |
19 | def seconds_to_frames(seconds):
20 | """Convert seconds to number of frames"""
21 | return samples_to_frames(seconds_to_samples(seconds))
22 |
23 |
24 | def seconds_to_samples(seconds):
25 | """Convert seconds to number of samples"""
26 | return seconds * emphases.SAMPLE_RATE
27 |
28 |
29 | def samples_to_frames(samples):
30 | """Convert samples to number of frames"""
31 | return samples // emphases.HOPSIZE
32 |
33 |
34 | def samples_to_seconds(samples):
35 | """Convert number of samples to seconds"""
36 | return samples / emphases.SAMPLE_RATE
37 |
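A worked example under the default audio parameters (SAMPLE_RATE=16000 and HOPSIZE=160 in emphases/config/defaults.py, i.e., 100 frames per second):

import emphases

emphases.convert.seconds_to_samples(1.)    # 16000.0 samples
emphases.convert.samples_to_frames(16000)  # 100 frames
emphases.convert.seconds_to_frames(1.)     # 100.0 frames (via samples)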
--------------------------------------------------------------------------------
/emphases/data/__init__.py:
--------------------------------------------------------------------------------
1 | from . import download
2 | from . import preprocess
3 | from .collate import collate
4 | from .dataset import Dataset
5 | from .loader import loader
6 | from .sampler import sampler
7 |
--------------------------------------------------------------------------------
/emphases/data/collate.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Batch collation
8 | ###############################################################################
9 |
10 |
11 | def collate(batch):
12 | """Batch collation"""
13 | # Unpack
14 | features, scores, word_bounds, alignments, audios, stems = zip(*batch)
15 |
16 | # Get word lengths
17 | word_lengths = torch.tensor(
18 | [bounds.shape[-1] for bounds in word_bounds],
19 | dtype=torch.long)
20 | max_word_length = word_lengths.max().item()
21 |
22 | # Get frame lengths
23 | frame_lengths = torch.tensor(
24 | [feat.shape[-1] for feat in features],
25 | dtype=torch.long)
26 | max_frame_length = frame_lengths.max().item()
27 |
28 | # Network output lengths
29 | output_lengths = word_lengths
30 | max_output_length = max_word_length
31 |
32 | # Allocate padded tensors
33 | padded_features = torch.zeros(
34 | (len(features), emphases.NUM_FEATURES, max_frame_length))
35 | padded_scores = torch.zeros((len(scores), 1, max_output_length))
36 | padded_bounds = torch.zeros(
37 | (len(word_bounds), 2, max_word_length),
38 | dtype=torch.long)
39 | padded_audio = torch.zeros(
40 | (len(audios), 1, max_frame_length * emphases.HOPSIZE))
41 |
42 | # Place batch in padded tensors
43 | for (
44 | i,
45 | (bounds, audio, feat, score, frame_length, word_length, output_length)
46 | ) in enumerate(
47 | zip(
48 | word_bounds,
49 | audios,
50 | features,
51 | scores,
52 | frame_lengths,
53 | word_lengths,
54 | output_lengths)
55 | ):
56 |
57 | # Pad features
58 | padded_features[i, :, :frame_length] = feat
59 |
60 | # Pad scores
61 | padded_scores[i, :, :output_length] = score[:, :output_length]
62 |
63 | # Pad word bounds
64 | padded_bounds[i, :, :word_length] = bounds[:, :word_length]
65 |
66 | # Pad audio
67 | end_sample = frame_length * emphases.HOPSIZE
68 | padded_audio[i, :, :end_sample] = audio[:, :end_sample]
69 |
70 | return (
71 | padded_features,
72 | frame_lengths,
73 | padded_bounds,
74 | word_lengths,
75 | padded_scores,
76 | alignments,
77 | padded_audio,
78 | stems)
79 |
--------------------------------------------------------------------------------
/emphases/data/dataset.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import numpy as np
4 | import pypar
5 | import torch
6 | import torchaudio
7 |
8 | import emphases
9 |
10 |
11 | ###############################################################################
12 | # Dataset
13 | ###############################################################################
14 |
15 |
16 | class Dataset(torch.utils.data.Dataset):
17 |
18 | def __init__(self, name, partition):
19 | self.cache = emphases.CACHE_DIR / name
20 |
21 | # Get list of stems
22 | with open(emphases.PARTITION_DIR / f'{name}.json') as file:
23 | self.stems = json.load(file)[partition]
24 |
25 | # Store lengths for bucketing
26 | audio_files = [
27 | self.cache / 'audio' / f'{stem}.wav' for stem in self.stems]
28 | self.lengths = [
29 | emphases.convert.samples_to_frames(
30 | torchaudio.info(audio_file).num_frames)
31 | for audio_file in audio_files]
32 |
33 | # Total number of frames
34 | self.frames = sum(self.lengths)
35 |
36 | def __getitem__(self, index):
37 | """Retrieve the indexth item"""
38 | stem = self.stems[index]
39 |
40 | # Load alignment
41 | alignment = pypar.Alignment(
42 | self.cache / 'alignment' / f'{stem}.TextGrid')
43 |
44 | # Compute word bounds
45 | bounds = alignment.word_bounds(
46 | emphases.SAMPLE_RATE,
47 | emphases.HOPSIZE,
48 | silences=True)
49 | word_bounds = torch.cat(
50 | [torch.tensor(bound)[None] for bound in bounds]).T
51 |
52 | # Load audio
53 | audio = emphases.load.audio(self.cache / 'audio' / f'{stem}.wav')
54 |
55 | features = []
56 |
57 | # Load mels
58 | if emphases.MEL_FEATURE:
59 | features.append(torch.load(self.cache / 'mels' / f'{stem}.pt'))
60 |
61 | # Load pitch
62 | if emphases.PITCH_FEATURE:
63 | pitch = torch.load(self.cache / 'pitch' / f'{stem}-pitch.pt')
64 | if emphases.NORMALIZE:
65 | features.append(
66 | (torch.log2(pitch) - emphases.LOGFMIN) /
67 | (emphases.LOGFMAX - emphases.LOGFMIN))
68 | else:
69 | features.append(torch.log2(pitch))
70 |
71 | # Load periodicity
72 | if emphases.PERIODICITY_FEATURE:
73 | periodicity = torch.load(
74 | self.cache / 'pitch' / f'{stem}-periodicity.pt')
75 | features.append(periodicity)
76 |
77 | # Load loudness
78 | if emphases.LOUDNESS_FEATURE:
79 | loudness = torch.load(self.cache / 'loudness' / f'{stem}.pt')
80 | features.append(loudness)
81 |
82 | # Concatenate
83 | features = features[0] if len(features) == 1 else torch.cat(features)
84 |
85 | # Load per-word ground truth emphasis scores
86 | scores = torch.load(self.cache / 'scores' / f'{stem}.pt')[None]
87 |
88 | return features, scores, word_bounds, alignment, audio, stem
89 |
90 | def __len__(self):
91 | """Length of the dataset"""
92 | return len(self.stems)
93 |
94 | def buckets(self):
95 | """Partition indices into buckets based on length for sampling"""
96 | # Get the size of a bucket
97 | size = len(self) // emphases.BUCKETS
98 |
99 | # Get indices in order of length
100 | indices = np.argsort(self.lengths)
101 | lengths = np.sort(self.lengths)
102 |
103 | # Split into buckets based on length
104 | buckets = [
105 | np.stack((indices[i:i + size], lengths[i:i + size])).T
106 | for i in range(0, len(self), size)]
107 |
108 | # Concatenate partial bucket
109 | if len(buckets) == emphases.BUCKETS + 1:
110 | residual = buckets.pop()
111 | buckets[-1] = np.concatenate((buckets[-1], residual), axis=0)
112 |
113 | return buckets
114 |
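A minimal usage sketch, assuming the 'libritts' dataset has already been downloaded, preprocessed, and partitioned into the cache:

    import emphases

    dataset = emphases.data.Dataset('libritts', 'train')
    print(len(dataset), 'utterances;', dataset.frames, 'frames')

    # Each item pairs frame-rate features with word-rate emphasis scores
    features, scores, word_bounds, alignment, audio, stem = dataset[0]

    # Length-sorted buckets consumed by the training sampler
    buckets = dataset.buckets()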
--------------------------------------------------------------------------------
/emphases/data/download/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 |
--------------------------------------------------------------------------------
/emphases/data/download/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Download datasets
8 | ###############################################################################
9 |
10 |
11 | def parse_args():
12 | """Parse command-line arguments"""
13 | parser = argparse.ArgumentParser(description='Download datasets')
14 | parser.add_argument(
15 | '--datasets',
16 | nargs='+',
17 | default=emphases.DATASETS,
18 | help='The datasets to download')
19 | parser.add_argument(
20 | '--gpu',
21 | type=int,
22 | help='The index of the gpu to run inference on')
23 | return parser.parse_known_args()[0]
24 |
25 |
26 | emphases.data.download.datasets(**vars(parse_args()))
27 |
--------------------------------------------------------------------------------
/emphases/data/download/core.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import json
3 | import shutil
4 | import ssl
5 | import tarfile
6 | import urllib.request
7 | 
8 | import pyfoal
9 | import pypar
10 | import torch
11 | import torchaudio
12 | import torchutil
13 | import yaml
14 | 
15 | import emphases
16 |
17 |
18 | ###############################################################################
19 | # Constants
20 | ###############################################################################
21 |
22 |
23 | # List of tokens to filter from Buckeye annotations
24 | BUCKEYE_FILTER_LIST = [
25 | '{B_TRANS}',
26 | '{E_TRANS}',
27 | '',
28 | '',
29 | '',
30 | '',
31 | '',
32 | '',
33 | '',
34 | '',
35 | '',
36 | '',
37 | '',
38 | '',
39 | '',
40 | '',
41 | '',
42 | '',
43 | ]
44 |
45 | # Speakers selected by sorting the train-clean-100 speakers by longest total
46 | # recording duration and manually selecting speakers with more natural,
47 | # conversational (as opposed to read) prosody
48 | LIBRITTS_SPEAKERS = [
49 | # Top 5 Female (primarily by length)
50 | 40,
51 | 669,
52 | 4362,
53 | 5022,
54 | 8123,
55 |
56 | # Additional female speakers to get to 1/8th of train-clean-100
57 | 5022,
58 | 696,
59 | 6272,
60 | 5163,
61 |
62 | # Top 5 Male (primarily by length)
63 | 196,
64 | 460,
65 | 1355,
66 | 3664,
67 | 7067,
68 |
69 | # Additional male speakers to get to 1/8th of train-clean-100
70 | 405,
71 | 6437,
72 | 446,
73 | 4397
74 | ]
75 |
76 |
77 | ###############################################################################
78 | # Download datasets
79 | ###############################################################################
80 |
81 |
82 | @torchutil.notify('download')
83 | def datasets(datasets, gpu=None):
84 | """Download datasets"""
85 | for dataset in datasets:
86 | if dataset == 'automatic':
87 | automatic(gpu=gpu)
88 | elif dataset == 'buckeye':
89 | buckeye()
90 | elif dataset == 'crowdsource':
91 | crowdsource()
92 | elif dataset == 'libritts':
93 | libritts()
94 | else:
95 | raise ValueError(f'Dataset {dataset} is not defined')
96 |
97 |
98 | ###############################################################################
99 | # Individual dataset downloaders
100 | ###############################################################################
101 |
102 |
103 | def automatic(gpu=None):
104 | """Create dataset from trained model"""
105 | # Setup directories
106 | cache_directory = emphases.CACHE_DIR / 'automatic'
107 | cache_directory.mkdir(exist_ok=True, parents=True)
108 |
109 | # Create subdirectories
110 | features = ['alignment', 'audio', 'scores']
111 | for feature in features:
112 | (cache_directory / feature).mkdir(exist_ok=True, parents=True)
113 |
114 | # Get files
115 | audio_files = list(
116 | (emphases.CACHE_DIR / 'libritts' / 'audio').rglob('*.wav'))
117 | stems = [file.stem for file in audio_files]
118 |
119 | # Copy from LibriTTS cache to annotation cache
120 | for stem in stems:
121 |
122 | # Copy audio
123 | audio_file = (
124 | emphases.CACHE_DIR / 'automatic' / 'audio' / f'{stem}.wav')
125 | shutil.copyfile(
126 | emphases.CACHE_DIR / 'libritts' / 'audio' / f'{stem}.wav',
127 | audio_file)
128 |
129 | # Copy alignment
130 | shutil.copyfile(
131 | emphases.CACHE_DIR / 'libritts' / 'alignment' / f'{stem}.TextGrid',
132 | emphases.CACHE_DIR / 'automatic' / 'alignment' / f'{stem}.TextGrid')
133 |
134 | # Load alignment
135 | alignment = pypar.Alignment(
136 | emphases.CACHE_DIR / 'automatic' / 'alignment' / f'{stem}.TextGrid')
137 |
138 | # Load audio
139 | audio, _ = torchaudio.load(audio_file)
140 |
141 | # Infer scores
142 | scores = emphases.from_alignment_and_audio(
143 | alignment,
144 | audio,
145 | emphases.SAMPLE_RATE,
146 | gpu=gpu).detach().cpu()
147 |
148 | # Save scores
149 | torch.save(scores, cache_directory / 'scores' / f'{stem}.pt')
150 |
151 |
152 | def crowdsource():
153 | """Prepare crowdsourced dataset"""
154 | # Get annotation config
155 | with open(emphases.DEFAULT_ANNOTATION_CONFIG, "r") as stream:
156 | annotation_config = yaml.safe_load(stream)
157 |
158 | # Setup directories
159 | data_directory = emphases.DATA_DIR / 'crowdsource'
160 | cache_directory = emphases.CACHE_DIR / 'crowdsource'
161 | cache_directory.mkdir(exist_ok=True, parents=True)
162 |
163 | # Create subdirectories
164 | features = ['alignment', 'audio', 'scores']
165 | for feature in features:
166 | (cache_directory / feature).mkdir(exist_ok=True, parents=True)
167 |
168 | # Load annotations data
169 | annotation_data = {}
170 | for directory in data_directory.glob('*'):
171 |
172 | source_directory = directory / annotation_config['name']
173 | table_directory = source_directory / 'tables'
174 |
175 | # Participant data
176 | participants = {}
177 | with open(table_directory / 'participants.csv') as file:
178 | for row in csv.DictReader(file):
179 | try:
180 |
181 | # Crowdsourced annotation
182 | participants[row['ID']] = {
183 | 'language': row['Language'],
184 | 'country': row['Country'],
185 | 'annotations': []}
186 |
187 |                 except KeyError:
188 |
189 | # Manual annotation
190 | participants[row['ID']] = {
191 | 'language': 'English',
192 | 'country': 'United States',
193 | 'annotations': []}
194 |
195 | # Response data
196 | with open(table_directory / 'responses.csv') as file:
197 | for row in csv.DictReader(file):
198 | participant = row['Participant']
199 |
200 | # Add participant
201 | if participant not in annotation_data:
202 | annotation_data[participant] = participants[participant]
203 |
204 | # Get word start and end times
205 | alignment = pypar.Alignment(
206 | emphases.CACHE_DIR /
207 | 'libritts' /
208 | 'alignment' /
209 | f'{row["Stem"]}.TextGrid')
210 | words = [
211 | (str(word).lower(), word.start(), word.end())
212 | for word in alignment
213 | if str(word) != pypar.SILENCE]
214 |
215 | # Format annotation
216 | entry = {
217 | 'stem': row['Stem'],
218 | 'score': [float(c) for c in row['Response']],
219 | 'words': words}
220 | assert len(entry['words']) == len(entry['score'])
221 |
222 | # Add annotation
223 | annotation_data[participant]['annotations'].append(entry)
224 |
225 | # Get worker ID correspondence
226 | correspondence = {}
227 | for directory in data_directory.glob('*'):
228 | file = (
229 | directory /
230 | annotation_config['name'] /
231 | 'crowdsource' /
232 | 'crowdsource.json')
233 | with open(file) as file:
234 | contents = json.load(file)
235 | for content in contents:
236 | correspondence |= {content['ParticipantID']: content['WorkerId']}
237 |
238 | # Crowdsourced annotation
239 | if correspondence:
240 |
241 |         # Filter out incomplete batches and those where > 1/3 of examples have > 2/3 of words selected
242 | def valid(items):
243 | if not hasattr(valid, 'count'):
244 | valid.count = 0
245 | sums = [sum(item['score']) for item in items]
246 | counts = [len(item['score']) for item in items]
247 | invalids = [s > .67 * c for s, c in zip(sums, counts)]
248 | is_valid = sum(invalids) < .33 * len(invalids)
249 | valid.count += 1 - int(is_valid)
250 | return is_valid
251 |
252 | # Join participants with same worker ID
253 | joined = {}
254 | for participant, contents in annotation_data.items():
255 |
256 | # Filter out bad batches
257 | if (
258 | len(contents['annotations']) < 20 or
259 | len(contents['annotations']) % 10 > 0 or
260 | not valid(contents['annotations'])
261 | ):
262 | continue
263 |
264 | worker = correspondence[participant]
265 | if worker in joined:
266 | joined[worker]['annotations'].extend(contents['annotations'])
267 | else:
268 | joined[worker] = contents
269 |
270 | # Manual annotation
271 | else:
272 | joined = annotation_data
273 |
274 | # Anonymize
275 | anonymized = {}
276 | for i, contents in enumerate(joined.values()):
277 | anonymized[f'{i:06d}'] = contents
278 |
279 | # Save annotations in release format
280 | with open(cache_directory / 'annotations.json', 'w') as file:
281 | json.dump(anonymized, file, sort_keys=True, indent=True)
282 |
283 | # Merge binary annotations to floats
284 | annotations = merge_annotations(anonymized)
285 |
286 | # Save dictionary containing annotation counts
287 | with open(cache_directory / 'counts.json', 'w') as file:
288 | json.dump(annotations['stems'], file, sort_keys=True, indent=True)
289 |
290 | # Get annotated stems
291 | stems = [
292 | file.replace('libritts-', '')
293 | for file in annotations['stems'].keys()]
294 |
295 | # Copy from LibriTTS cache to annotation cache
296 |     for stem in stems:
297 |
298 | # Get normalized scores
299 | count = annotations['stems'][stem]
300 | labels = [score / count for score in annotations['scores'][stem]]
301 |
302 | # Copy audio
303 | shutil.copyfile(
304 | emphases.CACHE_DIR / 'libritts' / 'audio' / f'{stem}.wav',
305 | emphases.CACHE_DIR / 'crowdsource' / 'audio' / f'{stem}.wav')
306 |
307 | # Copy alignment
308 | shutil.copyfile(
309 | emphases.CACHE_DIR / 'libritts' / 'alignment' / f'{stem}.TextGrid',
310 | emphases.CACHE_DIR / 'crowdsource' / 'alignment' / f'{stem}.TextGrid')
311 |
312 | # Load alignment
313 | alignment = pypar.Alignment(
314 | emphases.CACHE_DIR / 'crowdsource' / 'alignment' / f'{stem}.TextGrid')
315 |
316 | # Match alignment and scores (silences get a score of zero)
317 | j = 0
318 | scores = torch.zeros(len(alignment))
319 | for i, word in enumerate(alignment):
320 |
321 | # Keep silences as zero
322 | if str(word) == pypar.SILENCE:
323 | continue
324 |
325 | # Update scores
326 | scores[i] = float(labels[j])
327 |
328 | j += 1
329 |
330 | # Save scores
331 | torch.save(scores, cache_directory / 'scores' / f'{stem}.pt')
332 |
333 |
334 | def buckeye():
335 | """Download buckeye dataset"""
336 | # Extract tar file to data directory
337 | file = emphases.SOURCE_DIR / 'buckeye' / 'buckeye.tar.gz'
338 | with tarfile.open(file, 'r:gz') as tfile:
339 | tfile.extractall(emphases.DATA_DIR)
340 |
341 | # Setup cache directory
342 | cache_directory = emphases.CACHE_DIR / 'buckeye'
343 | cache_directory.mkdir(exist_ok=True, parents=True)
344 |
345 | # Create subdirectories
346 | features = ['alignment', 'audio', 'scores']
347 | for feature in features:
348 | (cache_directory / feature).mkdir(exist_ok=True, parents=True)
349 |
350 | # Copy alignments and filter out unused tokens
351 | data_directory = emphases.DATA_DIR / 'buckeye'
352 | alignment_files = (data_directory / 'alignment').glob('*.TextGrid')
353 | for file in alignment_files:
354 |
355 | # Load alignment
356 | alignment = pypar.Alignment(file)
357 |
358 | # Filter
359 | for word in alignment:
360 | if str(word) in BUCKEYE_FILTER_LIST:
361 | word.word = pypar.SILENCE
362 | word.phonemes = [
363 | pypar.Phoneme(pypar.SILENCE, word.start(), word.end())]
364 |
365 | # Deduplicate silence tokens
366 | i = 0
367 | words = alignment.words()
368 | prev_silence = False
369 | while i < len(words):
370 | word = words[i]
371 | if str(word) == pypar.SILENCE:
372 | if prev_silence:
373 | words[i - 1][-1]._end = word.end()
374 | del words[i]
375 | else:
376 | prev_silence = True
377 | i += 1
378 | else:
379 | prev_silence = False
380 | i += 1
381 |
382 | # Save alignment
383 | pypar.Alignment(words).save(
384 | cache_directory / 'alignment' / f'{file.stem}.TextGrid')
385 |
386 | # Get audio files
387 | audio_files = sorted((data_directory / 'audio').glob('*.wav'))
388 |
389 | # Resample audio
390 | for audio_file in audio_files:
391 |
392 | # Load and resample
393 | audio = emphases.load.audio(audio_file)
394 |
395 | # If audio is too quiet, increase the volume
396 | maximum = torch.abs(audio).max()
397 | if maximum < .35:
398 | audio *= .35 / maximum
399 |
400 | # Save to disk
401 | torchaudio.save(
402 | cache_directory / 'audio' / audio_file.name,
403 | audio,
404 | emphases.SAMPLE_RATE)
405 |
406 | # Read buckeye annotations
407 | data_directory = emphases.DATA_DIR / 'buckeye'
408 | with open(data_directory / 'annotations.csv') as file:
409 | reader = csv.DictReader(file)
410 | annotations = [row for row in reader]
411 |
412 | # Extract per-word emphasis scores
413 | alignment_files = (cache_directory / 'alignment').glob('*.TextGrid')
414 | for file in alignment_files:
415 |
416 | # Load alignment
417 | alignment = pypar.Alignment(file)
418 |
419 | # Get words from annotation
420 | words = [word for word in annotations if word['filename'] == file.stem]
421 | words = sorted(words, key=lambda x: float(x['wordmin']))
422 |
423 | # Get per-word emphasis scores
424 | j = 0
425 | scores = torch.zeros(len(alignment))
426 | for i, word in enumerate(alignment):
427 |
428 | # Keep silences as zero
429 | if str(word) == pypar.SILENCE:
430 | continue
431 |
432 |             # Make sure the annotation words match the alignment
433 |             assert str(word).lower() == words[j]['word'].lower()
434 |             assert abs(word.start() - float(words[j]['wordmin'])) < 1e-4
435 |             assert abs(word.end() - float(words[j]['wordmax'])) < 1e-4
436 |
437 | # Update scores
438 | # pa.32 is the average of 32 human judgments of the perception of
439 | # prominence based on acoustic features
440 | scores[i] = float(words[j]['pa.32'])
441 |
442 | j += 1
443 |
444 | # Save scores
445 | torch.save(scores, cache_directory / 'scores' / f'{file.stem}.pt')
446 |
447 |
448 | def libritts():
449 | """Download libritts dataset"""
450 | # Setup source directory
451 | source_directory = emphases.SOURCE_DIR / 'libritts'
452 | source_directory.mkdir(exist_ok=True, parents=True)
453 |
454 | # Download
455 | url = 'https://us.openslr.org/resources/60/train-clean-100.tar.gz'
456 | file = source_directory / 'libritts-train-clean-100.tar.gz'
457 | torchutil.download.file(url, file)
458 |
459 | # Unzip
460 | with tarfile.open(file, 'r:gz') as tfile:
461 | tfile.extractall(emphases.DATA_DIR)
462 |
463 | # Rename folder
464 | directory = emphases.DATA_DIR / 'libritts'
465 | shutil.rmtree(directory, ignore_errors=True)
466 | shutil.move(emphases.DATA_DIR / 'LibriTTS', directory)
467 |
468 | # Download annotations from zenodo
469 | url = 'https://zenodo.org/records/10402793/files/libritts-emphasis-annotations.json?download=1'
470 | file = source_directory / 'annotations.json'
471 | torchutil.download.file(url, file)
472 |
473 | # Load annotations
474 | with open(source_directory / 'annotations.json') as file:
475 | annotations = json.load(file)
476 |
477 | # Merge annotations to floats
478 | annotations = merge_annotations(annotations)
479 |
480 | # Get list of audio files
481 | audio_files = list(directory.rglob('*.wav'))
482 | audio_files = [
483 | file for file in audio_files if file.stem in annotations['stems']]
484 |
485 | # Setup cache directory
486 | cache_directory = emphases.CACHE_DIR / 'libritts'
487 | cache_directory.mkdir(exist_ok=True, parents=True)
488 |
489 | # Create subdirectories
490 | features = ['alignment', 'audio', 'scores']
491 | for feature in features:
492 | (cache_directory / feature).mkdir(exist_ok=True, parents=True)
493 |
494 | # Iterate over files
495 | for audio_file in torchutil.iterator(
496 | audio_files,
497 | 'Formatting libritts',
498 | total=len(audio_files)
499 | ):
500 |
501 | # Load and resample audio
502 | audio = emphases.load.audio(audio_file)
503 |
504 | # If audio is too quiet, increase the volume
505 | maximum = torch.abs(audio).max()
506 | if maximum < .35:
507 | audio *= .35 / maximum
508 |
509 | # Save audio
510 | stem = audio_file.stem
511 | torchaudio.save(
512 | cache_directory / 'audio' / f'{stem}.wav',
513 | audio,
514 | emphases.SAMPLE_RATE)
515 |
516 | # Align text and audio
517 | text_files = [
518 | file.with_suffix('.normalized.txt') for file in audio_files]
519 | alignment_files = [
520 | cache_directory / 'alignment' / f'{file.stem}.TextGrid'
521 | for file in audio_files]
522 | pyfoal.from_files_to_files(
523 | text_files,
524 | audio_files,
525 | alignment_files,
526 | 'p2fa')
527 |
528 |     for stem in [file.stem for file in audio_files]:
529 |
530 | # Load alignment
531 | alignment = pypar.Alignment(
532 | cache_directory / 'alignment' / f'{stem}.TextGrid')
533 |
534 | # Get ground truth
535 | count = annotations['stems'][stem]
536 | labels = [score / count for score in annotations['scores'][stem]]
537 |
538 | # Match alignment and scores (silences get a score of zero)
539 | j = 0
540 | scores = torch.zeros(len(alignment))
541 | for i, word in enumerate(alignment):
542 |
543 | # Keep silences as zero
544 | if str(word) == pypar.SILENCE:
545 | continue
546 |
547 | # Update scores
548 | scores[i] = float(labels[j])
549 |
550 | j += 1
551 |
552 | # Save scores
553 | torch.save(scores, cache_directory / 'scores' / f'{stem}.pt')
554 |
555 |
556 | ###############################################################################
557 | # Utilities
558 | ###############################################################################
559 |
560 |
561 | def download_file(url, file):
562 | """Download file from url"""
563 | with urllib.request.urlopen(url, context=ssl.SSLContext()) as response, \
564 | open(file, 'wb') as output:
565 | shutil.copyfileobj(response, output)
566 |
567 |
568 | def merge_annotations(annotations):
569 | """Merge crowdsourced annotations"""
570 | merged = {'samples': 0, 'scores': {}, 'stems': {}}
571 | for _, responses in annotations.items():
572 |
573 | # Iterate over stems
574 | for response in responses['annotations']:
575 | stem = response['stem']
576 | score = [float(c) for c in list(response['score'])]
577 |
578 | # Merge stem annotations
579 | if stem in merged['stems']:
580 |
581 | # Maybe cap the number of allowed annotations
582 | if (
583 | emphases.MAX_ANNOTATIONS is not None and
584 | merged['stems'][stem] == emphases.MAX_ANNOTATIONS
585 | ):
586 | continue
587 |
588 | # Update sums and counts
589 | for i in range(len(score)):
590 | merged['scores'][stem][i] += score[i]
591 | merged['stems'][stem] += 1
592 |
593 | # Add new stem
594 | else:
595 | merged['scores'][stem] = score
596 | merged['stems'][stem] = 1
597 |
598 | # Update total number of samples
599 | merged['samples'] += 1
600 |
601 |     # Maybe keep only stems with the required number of annotations
602 | if emphases.MIN_ANNOTATIONS is not None:
603 | merged['stems'] = {
604 | stem: count for stem, count in merged['stems'].items()
605 | if count == emphases.MIN_ANNOTATIONS}
606 | merged['scores'] = {
607 | stem: scores for stem, scores in merged['scores'].items()
608 | if stem in merged['stems']}
609 |
610 | return merged
611 |
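To make the merging convention concrete, a small worked sketch of merge_annotations on invented data, assuming MIN_ANNOTATIONS and MAX_ANNOTATIONS are both None in the active configuration:

    import emphases

    # Two annotators marked the same three-word utterance; per-word sums and
    # annotation counts are tracked so scores can later be normalized
    annotations = {
        '000000': {'annotations': [{'stem': 'stem_a', 'score': [1., 0., 1.]}]},
        '000001': {'annotations': [{'stem': 'stem_a', 'score': [1., 0., 0.]}]}}
    merged = emphases.data.download.merge_annotations(annotations)
    # merged['scores']['stem_a'] == [2.0, 0.0, 1.0]
    # merged['stems']['stem_a'] == 2, so normalized scores are [1.0, 0.0, 0.5]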
--------------------------------------------------------------------------------
/emphases/data/loader.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Dataloader
8 | ###############################################################################
9 |
10 |
11 | def loader(dataset, partition=None, gpu=None):
12 | """Retrieve a data loader"""
13 | # Get dataset
14 | dataset = emphases.data.Dataset(dataset, partition)
15 |
16 | # Get sampler
17 | sampler = emphases.data.sampler(dataset, partition)
18 |
19 | # Create loader
20 | return torch.utils.data.DataLoader(
21 | dataset,
22 | num_workers=emphases.NUM_WORKERS,
23 | pin_memory=gpu is not None,
24 | collate_fn=emphases.data.collate,
25 | batch_sampler=sampler)
26 |
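For example, iterating the training partition might look like the following sketch; batch contents follow the collate function above:

    import emphases

    loader = emphases.data.loader('libritts', 'train', gpu=None)
    for batch in loader:
        features, frame_lengths, word_bounds, word_lengths, *_ = batch
        # features: (batch, NUM_FEATURES, frames); word_bounds: (batch, 2, words)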
--------------------------------------------------------------------------------
/emphases/data/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 | from . import mels
3 | from . import loudness
4 |
--------------------------------------------------------------------------------
/emphases/data/preprocess/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Entry point
8 | ###############################################################################
9 |
10 |
11 | def parse_args():
12 | """Parse command-line arguments"""
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument(
15 | '--datasets',
16 | nargs='+',
17 | default=emphases.DATASETS,
18 | help='The datasets to preprocess')
19 | parser.add_argument(
20 | '--gpu',
21 | type=int,
22 | help='The index of the gpu to run inference on')
23 | return parser.parse_known_args()[0]
24 |
25 |
26 | if __name__ == '__main__':
27 | emphases.data.preprocess.datasets(**vars(parse_args()))
28 |
--------------------------------------------------------------------------------
/emphases/data/preprocess/core.py:
--------------------------------------------------------------------------------
1 | import penn
2 | import torch
3 | import torchutil
4 |
5 | import emphases
6 |
7 |
8 | ###############################################################################
9 | # Preprocess
10 | ###############################################################################
11 |
12 |
13 | @torchutil.notify('preprocess')
14 | def datasets(datasets, gpu=None):
15 | """Preprocess datasets"""
16 | for dataset in datasets:
17 | cache_directory = emphases.CACHE_DIR / dataset
18 |
19 | # Get audio files, from cache
20 | audio_files = sorted(cache_directory.rglob('*.wav'))
21 |
22 | # Preprocess mels
23 | mel_files = [
24 | cache_directory / 'mels' / f'{file.stem}.pt'
25 | for file in audio_files]
26 | emphases.data.preprocess.mels.from_files_to_files(
27 | audio_files,
28 | mel_files)
29 |
30 | # Preprocess loudness
31 | loudness_files = [
32 | cache_directory / 'loudness' / f'{file.stem}.pt'
33 | for file in audio_files]
34 | emphases.data.preprocess.loudness.from_files_to_files(
35 | audio_files,
36 | loudness_files)
37 |
38 | # Preprocess pitch, periodicity
39 | (cache_directory / 'pitch').mkdir(exist_ok=True, parents=True)
40 | pitch_files = [
41 | cache_directory / 'pitch' / f'{file.stem}'
42 | for file in audio_files]
43 | penn.from_files_to_files(
44 | audio_files,
45 | pitch_files,
46 | hopsize=emphases.convert.samples_to_seconds(emphases.HOPSIZE),
47 | fmin=emphases.FMIN,
48 | fmax=emphases.FMAX,
49 | batch_size=2048,
50 | center='half-hop',
51 | interp_unvoiced_at=emphases.VOICED_THRESHOLD,
52 | num_workers=emphases.NUM_WORKERS,
53 | gpu=gpu)
54 |
55 | # Pitch and periodicity use floating-point hopsize, while mels and
56 | # loudness use an integer hopsize in samples. This results in
57 | # single-frame differences when the audio length is within one sample
58 | # of a new frame due to floating-point error. We simply remove the last
59 | # frame in this rare case.
60 | for loudness_file, pitch_file in zip(loudness_files, pitch_files):
61 | loudness = torch.load(loudness_file)
62 | pitch = torch.load(f'{pitch_file}-pitch.pt')
63 | periodicity = torch.load(f'{pitch_file}-periodicity.pt')
64 | if pitch.shape[1] == loudness.shape[1] + 1:
65 | pitch = pitch[:, :-1]
66 | periodicity = periodicity[:, :-1]
67 | torch.save(pitch, f'{pitch_file}-pitch.pt')
68 | torch.save(periodicity, f'{pitch_file}-periodicity.pt')
69 |
70 |
71 | def from_audio(audio, gpu=None):
72 | """Preprocess one audio file"""
73 | # Move to device (no-op if devices are the same)
74 | audio = audio.to('cpu' if gpu is None else f'cuda:{gpu}')
75 |
76 | features = []
77 |
78 | # Preprocess mels
79 | if emphases.MEL_FEATURE:
80 | features.append(emphases.data.preprocess.mels.from_audio(audio))
81 |
82 | # Preprocess pitch and periodicity
83 | if emphases.PITCH_FEATURE or emphases.PERIODICITY_FEATURE:
84 | pitch, periodicity = penn.from_audio(
85 | audio,
86 | emphases.SAMPLE_RATE,
87 | hopsize=emphases.convert.samples_to_seconds(emphases.HOPSIZE),
88 | fmin=emphases.FMIN,
89 | fmax=emphases.FMAX,
90 | pad=True,
91 | interp_unvoiced_at=emphases.VOICED_THRESHOLD,
92 | gpu=gpu)
93 |
94 |         # Pitch and periodicity use floating-point hopsize, while mels and
95 |         # loudness use an integer hopsize in samples. This results in
96 |         # single-frame differences when the audio length is within one sample
97 |         # of a new frame due to floating-point error. We simply remove the last
98 |         # frame in this rare case.
99 |         frames = emphases.convert.samples_to_frames(audio.shape[-1])
100 |         if pitch.shape[1] == frames + 1:
101 |             pitch = pitch[:, :-1]
102 |             periodicity = periodicity[:, :-1]
103 | 
104 |         if emphases.PITCH_FEATURE:
105 |             if emphases.NORMALIZE:
106 |                 features.append(
107 |                     (torch.log2(pitch) - emphases.LOGFMIN) /
108 |                     (emphases.LOGFMAX - emphases.LOGFMIN))
109 |             else:
110 |                 features.append(torch.log2(pitch))
111 | 
112 |         if emphases.PERIODICITY_FEATURE:
113 |             features.append(periodicity)
114 |
115 | # Preprocess loudness
116 | if emphases.LOUDNESS_FEATURE:
117 | loudness = emphases.data.preprocess.loudness.from_audio(
118 | audio,
119 | emphases.SAMPLE_RATE)
120 | features.append(loudness.to(audio.device))
121 |
122 | # Concatenate features
123 | features = features[0] if len(features) == 1 else torch.cat(features)
124 |
125 | return features[None]
126 |
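A minimal sketch of preprocessing a single utterance in memory; the test tone is arbitrary, and which features are stacked depends on the *_FEATURE flags in the active configuration:

    import math

    import torch

    import emphases

    # One second of a 220 Hz tone at the model sample rate
    t = torch.arange(emphases.SAMPLE_RATE) / emphases.SAMPLE_RATE
    audio = torch.sin(2 * math.pi * 220. * t)[None]

    # Frame-rate features with shape (1, NUM_FEATURES, frames)
    features = emphases.data.preprocess.from_audio(audio, gpu=None)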
--------------------------------------------------------------------------------
/emphases/data/preprocess/loudness.py:
--------------------------------------------------------------------------------
1 | import multiprocessing as mp
2 | import warnings
3 |
4 | import librosa
5 | import numpy as np
6 | import penn
7 | import torch
8 | import torchutil
9 |
10 | import emphases
11 |
12 |
13 | ###############################################################################
14 | # Interface
15 | ###############################################################################
16 |
17 |
18 | def from_audio(audio, sample_rate=emphases.SAMPLE_RATE):
19 |     """Compute A-weighted loudness from audio"""
20 |     # Maybe resample
21 | audio = emphases.resample(audio, sample_rate)
22 |
23 | # Compute loudness
24 | return a_weighted(audio, sample_rate, hop_length=emphases.HOPSIZE)
25 |
26 |
27 | def from_file(audio_file):
28 |     """Load audio and compute loudness"""
29 | audio = emphases.load.audio(audio_file)
30 |
31 | # Compute loudness
32 | return from_audio(audio)
33 |
34 |
35 | def from_file_to_file(audio_file, output_file):
36 | """Compute loudness from audio file and save to disk"""
37 | loudness = from_file(audio_file)
38 |
39 | # Save to disk
40 | output_file.parent.mkdir(exist_ok=True, parents=True)
41 | torch.save(loudness, output_file)
42 |
43 |
44 | def from_files_to_files(audio_files, output_files):
45 | """Compute loudness for many files and save to disk"""
46 | torchutil.multiprocess_iterator(
47 | wrapper,
48 | zip(audio_files, output_files),
49 | 'Preprocessing a-weighted loudness',
50 | total=len(audio_files),
51 | num_workers=emphases.NUM_WORKERS)
52 |
53 |
54 | ###############################################################################
55 | # Loudness
56 | ###############################################################################
57 |
58 |
59 | def a_weighted(audio, sample_rate, hop_length=None, pad=False):
60 | """Retrieve the per-frame loudness"""
61 | # Save device
62 | device = audio.device
63 |
64 | # Default hop length of 10 ms
65 | hop_length = sample_rate // 100 if hop_length is None else hop_length
66 |
67 | if audio.dim() == 2:
68 | audio = audio[:, None, :]
69 | elif audio.dim() == 1:
70 | audio = audio[None, None, :]
71 |
72 | # Pad audio
73 | p = (emphases.NUM_FFT - emphases.HOPSIZE) // 2
74 | audio = torch.nn.functional.pad(audio, (p, p), "reflect").squeeze(1)
75 |
76 | # Convert to numpy
77 | audio = audio.detach().cpu().numpy().squeeze(0)
78 |
79 | # Cache weights
80 | if not hasattr(a_weighted, 'weights'):
81 | a_weighted.weights = perceptual_weights()
82 |
83 | # Take stft
84 | stft = librosa.stft(
85 | audio,
86 | n_fft=penn.WINDOW_SIZE,
87 | hop_length=hop_length,
88 | win_length=penn.WINDOW_SIZE,
89 | center=pad,
90 | pad_mode='constant')
91 |
92 | # Compute magnitude on db scale
93 | db = librosa.amplitude_to_db(np.abs(stft))
94 |
95 | # Apply A-weighting
96 | weighted = db + a_weighted.weights
97 |
98 | # Threshold
99 | weighted[weighted < emphases.MIN_DB] = emphases.MIN_DB
100 |
101 | # Average over weighted frequencies
102 | loudness = torch.from_numpy(weighted.mean(axis=0)).float().to(device)[None]
103 |
104 | # Scale to roughly [0, 1]
105 | if emphases.NORMALIZE:
106 | return (loudness + 100.) / 100.
107 | return loudness
108 |
109 |
110 | def perceptual_weights():
111 | """A-weighted frequency-dependent perceptual loudness weights"""
112 | frequencies = librosa.fft_frequencies(
113 | sr=penn.SAMPLE_RATE,
114 | n_fft=penn.WINDOW_SIZE)
115 |
116 | # A warning is raised for nearly inaudible frequencies, but it ends up
117 | # defaulting to -100 db. That default is fine for our purposes.
118 | with warnings.catch_warnings():
119 | warnings.simplefilter('ignore', RuntimeWarning)
120 | return librosa.A_weighting(frequencies)[:, None] - emphases.REF_DB
121 |
122 | def wrapper(item):
123 | """Multiprocessing wrapper"""
124 | from_file_to_file(*item)
125 |
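A quick sketch of computing A-weighted loudness for the same kind of test tone (illustrative only; NORMALIZE controls whether the result is scaled toward [0, 1]):

    import math

    import torch

    import emphases

    t = torch.arange(emphases.SAMPLE_RATE) / emphases.SAMPLE_RATE
    audio = torch.sin(2 * math.pi * 220. * t)[None]

    # Per-frame loudness with shape (1, frames)
    loudness = emphases.data.preprocess.loudness.from_audio(audio)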
--------------------------------------------------------------------------------
/emphases/data/preprocess/mels.py:
--------------------------------------------------------------------------------
1 | import multiprocessing as mp
2 | import os
3 |
4 | import librosa
5 | import torch
6 | import torchutil
7 |
8 | import emphases
9 |
10 |
11 | ###############################################################################
12 | # Mel spectrogram
13 | ###############################################################################
14 |
15 |
16 | def from_audio(audio):
17 | """Compute spectrogram from audio"""
18 | # Cache hann window
19 | if (
20 | not hasattr(from_audio, 'window') or
21 | from_audio.dtype != audio.dtype or
22 | from_audio.device != audio.device
23 | ):
24 | from_audio.window = torch.hann_window(
25 | emphases.WINDOW_SIZE,
26 | dtype=audio.dtype,
27 | device=audio.device)
28 | from_audio.dtype = audio.dtype
29 | from_audio.device = audio.device
30 |
31 | # Pad audio
32 | size = (emphases.NUM_FFT - emphases.HOPSIZE) // 2
33 | audio = torch.nn.functional.pad(
34 | audio,
35 | (size, size),
36 | mode='reflect')
37 |
38 | # Compute stft
39 | stft = torch.stft(
40 | audio.squeeze(1),
41 | emphases.NUM_FFT,
42 | hop_length=emphases.HOPSIZE,
43 | window=from_audio.window,
44 | center=False,
45 | normalized=False,
46 | onesided=True,
47 | return_complex=True)
48 | stft = torch.view_as_real(stft)[0]
49 |
50 | # Compute magnitude
51 | spectrogram = torch.sqrt(stft.pow(2).sum(-1) + 1e-6)
52 |
53 | # Convert to mels
54 | mels = linear_to_mel(spectrogram)
55 |
56 | # Scale to roughly [0, 1]
57 | if emphases.NORMALIZE:
58 | return (mels + 10.) / 10.
59 | return mels
60 |
61 |
62 | def from_file(audio_file):
63 | """Load audio and compute mels"""
64 | audio = emphases.load.audio(audio_file)
65 |
66 | # Compute mels
67 | return from_audio(audio)
68 |
69 |
70 | def from_file_to_file(audio_file, output_file):
71 | """Compute mels from audio file and save to disk"""
72 | mels = from_file(audio_file)
73 |
74 | # Save to disk
75 | output_file.parent.mkdir(exist_ok=True, parents=True)
76 | torch.save(mels, output_file)
77 |
78 |
79 | def from_files_to_files(audio_files, output_files):
80 | """Compute mels for many files and save to disk"""
81 | torchutil.multiprocess_iterator(
82 | wrapper,
83 | zip(audio_files, output_files),
84 | 'Preprocessing mels',
85 | total=len(audio_files),
86 | num_workers=emphases.NUM_WORKERS)
87 |
88 |
89 | ###############################################################################
90 | # Utilities
91 | ###############################################################################
92 |
93 |
94 | def linear_to_mel(spectrogram):
95 | # Create mel basis
96 |     if not hasattr(linear_to_mel, 'basis'):
97 | basis = librosa.filters.mel(
98 | sr=emphases.SAMPLE_RATE,
99 | n_fft=emphases.NUM_FFT,
100 | n_mels=emphases.NUM_MELS)
101 | basis = torch.from_numpy(basis)
102 | basis = basis.to(spectrogram.dtype).to(spectrogram.device)
103 | linear_to_mel.basis = basis
104 |
105 | # Convert to mels
106 | melspectrogram = torch.matmul(linear_to_mel.basis, spectrogram)
107 |
108 | # Apply dynamic range compression
109 | return torch.log(torch.clamp(melspectrogram, min=1e-5))
110 |
111 | def wrapper(item):
112 | """Multiprocessing wrapper"""
113 | from_file_to_file(*item)
114 |
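And the corresponding sketch for mels; the output here is (NUM_MELS, frames), with the batch dimension added by the caller in preprocess.from_audio:

    import math

    import torch

    import emphases

    t = torch.arange(emphases.SAMPLE_RATE) / emphases.SAMPLE_RATE
    audio = torch.sin(2 * math.pi * 220. * t)[None]

    # Log-mel spectrogram with shape (NUM_MELS, frames)
    mels = emphases.data.preprocess.mels.from_audio(audio)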
--------------------------------------------------------------------------------
/emphases/data/sampler.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Sampler selection
8 | ###############################################################################
9 |
10 |
11 | def sampler(dataset, partition):
12 | """Create batch sampler"""
13 |     # Deterministic, bucketed random sampler for training and validation
14 | if partition in ['train', 'valid']:
15 | return Sampler(dataset)
16 |
17 |     # Sample test data sequentially
18 | elif partition.startswith('test'):
19 | return torch.utils.data.BatchSampler(
20 | torch.utils.data.SequentialSampler(dataset),
21 | 1,
22 | False)
23 |
24 | else:
25 | raise ValueError(f'Partition {partition} is not defined')
26 |
27 |
28 | ###############################################################################
29 | # Samplers
30 | ###############################################################################
31 |
32 |
33 | class Sampler:
34 |
35 | def __init__(self, dataset, max_frames=emphases.MAX_TRAINING_FRAMES):
36 | self.max_frames = max_frames
37 | self.epoch = 0
38 | self.length = len(dataset)
39 | self.buckets = dataset.buckets()
40 |
41 | def __iter__(self):
42 | return iter(self.batch())
43 |
44 | def __len__(self):
45 | return len(self.batch())
46 |
47 | def batch(self):
48 | """Produces batch indices for one epoch"""
49 | # Deterministic shuffling based on epoch
50 | generator = torch.Generator()
51 | generator.manual_seed(emphases.RANDOM_SEED + self.epoch)
52 |
53 | # Iterate over length-partitioned buckets
54 | batches = []
55 | for bucket in self.buckets:
56 |
57 | # Shuffle bucket
58 | bucket = bucket[
59 | torch.randperm(len(bucket), generator=generator).tolist()]
60 |
61 | # Variable batch size
62 | batch = []
63 | max_length = 0
64 | for index, length in bucket:
65 | max_length = max(max_length, length)
66 | if (
67 | batch and
68 | (len(batch) + 1) * max_length > self.max_frames
69 | ):
70 | batches.append(batch)
71 | max_length = length
72 | batch = [index]
73 | else:
74 | batch.append(index)
75 |
76 | # Don't drop last batch
77 | if batch:
78 | batches.append(batch)
79 |
80 | # Shuffle
81 | return [
82 | batches[i] for i in
83 | torch.randperm(len(batches), generator=generator).tolist()]
84 |
85 | def set_epoch(self, epoch):
86 | self.epoch = epoch
87 |
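A sketch of how the training sampler is typically driven across epochs, assuming the Dataset above:

    import emphases

    dataset = emphases.data.Dataset('libritts', 'train')
    sampler = emphases.data.sampler(dataset, 'train')
    for epoch in range(2):

        # Reshuffle deterministically each epoch
        sampler.set_epoch(epoch)

        # Each batch is a list of indices capped at roughly
        # MAX_TRAINING_FRAMES total frames
        for batch_indices in sampler:
            pass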
--------------------------------------------------------------------------------
/emphases/evaluate/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 | from . import metrics
3 | from .metrics import Metrics
4 |
--------------------------------------------------------------------------------
/emphases/evaluate/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from pathlib import Path
3 |
4 | import emphases
5 |
6 |
7 | ###############################################################################
8 | # Entry point
9 | ###############################################################################
10 |
11 |
12 | def parse_args():
13 | """Parse command-line arguments"""
14 | parser = argparse.ArgumentParser()
15 | parser.add_argument(
16 | '--datasets',
17 | nargs='+',
18 | default=emphases.EVALUATION_DATASETS,
19 | help='The datasets to evaluate')
20 | parser.add_argument(
21 | '--checkpoint',
22 | type=Path,
23 | help='The checkpoint file to evaluate')
24 | parser.add_argument(
25 | '--gpu',
26 | type=int,
27 | help='The index of the GPU to use for evaluation')
28 |
29 | return parser.parse_known_args()[0]
30 |
31 |
32 | if __name__ == '__main__':
33 | emphases.evaluate.datasets(**vars(parse_args()))
34 |
--------------------------------------------------------------------------------
/emphases/evaluate/core.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import torch
4 | import torchutil
5 |
6 | import emphases
7 |
8 |
9 | ###############################################################################
10 | # Evaluate
11 | ###############################################################################
12 |
13 |
14 | @torchutil.notify('evaluate')
15 | def datasets(datasets, checkpoint=None, gpu=None):
16 | """Perform evaluation"""
17 | device = torch.device('cpu' if gpu is None else f'cuda:{gpu}')
18 |
19 | # Containers for results
20 | overall, granular = {}, {}
21 |
22 | # Evaluate each dataset
23 | for dataset in datasets:
24 |
25 | # Get data loader
26 | loader = emphases.data.loader(dataset, 'test', gpu)
27 |
28 | # Get mean and variance for Pearson Correlation
29 | target_stats = emphases.evaluate.metrics.Statistics()
30 | predicted_stats = emphases.evaluate.metrics.Statistics()
31 | for batch in loader:
32 |
33 | # Unpack
34 | _, _, _, word_lengths, targets, alignments, audio, _ = batch
35 |
36 | # Get predicted scores
37 | scores = emphases.from_alignment_and_audio(
38 | alignments[0],
39 | audio[0],
40 | emphases.SAMPLE_RATE,
41 | checkpoint=checkpoint,
42 | gpu=gpu)
43 |
44 | # Update statistics
45 | target_stats.update(targets, word_lengths)
46 | predicted_stats.update(scores[None], word_lengths)
47 |
48 | # Get metric class
49 | metric_fn = emphases.evaluate.Metrics
50 |
51 | # Per-file metrics
52 | file_metrics = metric_fn(predicted_stats, target_stats)
53 |
54 | # Per-dataset metrics
55 | dataset_metrics = metric_fn(predicted_stats, target_stats)
56 |
57 | # Iterate over test set
58 | for batch in torchutil.iterator(
59 | loader,
60 | f'Evaluating {emphases.CONFIG} on {dataset}',
61 | total=len(loader)
62 | ):
63 |
64 | # Unpack
65 | (
66 | _,
67 | frame_lengths,
68 | word_bounds,
69 | word_lengths,
70 | targets,
71 | alignments,
72 | audio,
73 | stems
74 | ) = batch
75 |
76 | # Reset file metrics
77 | file_metrics.reset()
78 |
79 | if emphases.METHOD == 'neural':
80 |
81 | # Get predicted scores
82 | scores = []
83 |
84 | # Preprocess audio
85 | for features, word_bounds in emphases.preprocess(
86 | alignments[0],
87 | audio[0],
88 | gpu=gpu
89 | ):
90 |
91 | # Infer
92 | logits = emphases.infer(
93 | features,
94 | word_bounds,
95 | checkpoint).detach()
96 |
97 | # Skip postprocessing
98 | scores.append(logits)
99 |
100 | # Concatenate results
101 | scores = torch.cat(scores, 2)
102 |
103 | else:
104 |
105 | # Baseline method inference
106 | scores = emphases.from_alignment_and_audio(
107 | alignments[0],
108 | audio[0],
109 | emphases.SAMPLE_RATE,
110 | gpu=gpu)[None]
111 |
112 | # Update metrics
113 | args = (scores, targets.to(device), word_lengths.to(device))
114 | file_metrics.update(*args)
115 | dataset_metrics.update(*args)
116 |
117 | # Copy results
118 | granular[f'{dataset}/{stems[0]}'] = file_metrics()
119 | overall[dataset] = dataset_metrics()
120 |
121 | # Write to json files
122 | directory = emphases.EVAL_DIR / emphases.CONFIG
123 | directory.mkdir(exist_ok=True, parents=True)
124 | with open(directory / 'overall.json', 'w') as file:
125 | json.dump(overall, file, indent=4)
126 | with open(directory / 'granular.json', 'w') as file:
127 | json.dump(granular, file, indent=4)
128 |
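Programmatically, evaluation can be launched as in the following sketch; the checkpoint path is hypothetical:

    from pathlib import Path

    import emphases

    emphases.evaluate.datasets(
        ['libritts'],
        checkpoint=Path('runs/base/00100000.pt'),  # hypothetical checkpoint
        gpu=0)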
--------------------------------------------------------------------------------
/emphases/evaluate/metrics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchutil
3 |
4 | import emphases
5 |
6 |
7 | ###############################################################################
8 | # Aggregate metric
9 | ###############################################################################
10 |
11 |
12 | class Metrics:
13 |
14 | def __init__(self, predicted_stats, target_stats):
15 | self.correlation = torchutil.metrics.PearsonCorrelation(
16 | *predicted_stats(),
17 | *target_stats())
18 | self.bce = BinaryCrossEntropy()
19 | self.mse = MeanSquaredError()
20 |
21 | def __call__(self):
22 | return {
23 | 'pearson_correlation': self.correlation(),
24 | 'bce': self.bce(),
25 | 'mse': self.mse()}
26 |
27 | def update(
28 | self,
29 | logits,
30 | targets,
31 | word_lengths):
32 | # Detach from graph
33 | logits = logits.detach()
34 |
35 | # Word resolution sequence mask
36 | mask = emphases.model.mask_from_lengths(word_lengths)
37 | logits, targets = logits[mask], targets[mask]
38 |
39 | # Update cross entropy
40 | self.bce.update(logits, targets)
41 |
42 | # Update squared error
43 | self.mse.update(emphases.postprocess(logits), targets)
44 |
45 | # Update pearson correlation
46 | self.correlation.update(emphases.postprocess(logits), targets)
47 |
48 | def reset(self):
49 | self.correlation.reset()
50 | self.bce.reset()
51 | self.mse.reset()
52 |
53 |
54 | ###############################################################################
55 | # Individual metrics
56 | ###############################################################################
57 |
58 |
59 | class BinaryCrossEntropy(torchutil.metrics.Average):
60 |
61 | def update(self, scores, targets):
62 | if emphases.LOSS == 'bce':
63 |
64 | # Get values from logits
65 | values = torch.nn.functional.binary_cross_entropy_with_logits(
66 | scores,
67 | targets,
68 | reduction='none')
69 |
70 | else:
71 |
72 | # Get values from probabilities
73 | x, y = torch.clamp(scores, 0., 1.), targets
74 | values = -(
75 | y * torch.log(x + 1e-6) + (1 - y) * torch.log(1 - x + 1e-6))
76 |
77 | # Update
78 | super().update(values, values.numel())
79 |
80 |
81 | # TODO - fix scaling
82 | class MeanSquaredError(torchutil.metrics.Average):
83 |
84 | def update(
85 | self,
86 | scores,
87 | targets):
88 | # Compute sum of MSE
89 | values = torch.nn.functional.mse_loss(
90 | scores,
91 | targets,
92 | reduction='none')
93 |
94 | # Update
95 | super().update(values, values.numel())
96 |
97 |
98 | ###############################################################################
99 | # Utilities
100 | ###############################################################################
101 |
102 |
103 | class Statistics(torchutil.metrics.MeanStd):
104 |
105 | def update(self, values, lengths):
106 | # Sequence mask
107 | mask = emphases.model.mask_from_lengths(lengths)
108 |
109 | # Update
110 | super().update(values[mask].flatten().tolist())
111 |
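A toy sketch of the two-pass usage implied by evaluate/core.py: accumulate target and predicted statistics first, then instantiate Metrics and stream updates (tensors are random placeholders):

    import torch

    import emphases

    scores = torch.rand(1, 1, 4)    # predicted word scores (or logits)
    targets = torch.rand(1, 1, 4)   # ground-truth word scores
    lengths = torch.tensor([4])     # words per utterance

    # First pass: streaming mean and standard deviation
    predicted_stats = emphases.evaluate.metrics.Statistics()
    target_stats = emphases.evaluate.metrics.Statistics()
    predicted_stats.update(scores, lengths)
    target_stats.update(targets, lengths)

    # Second pass: Pearson correlation, BCE, and MSE
    metrics = emphases.evaluate.Metrics(predicted_stats, target_stats)
    metrics.update(scores, targets, lengths)
    results = metrics()  # {'pearson_correlation': ..., 'bce': ..., 'mse': ...}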
--------------------------------------------------------------------------------
/emphases/load.py:
--------------------------------------------------------------------------------
1 | import torchaudio
2 |
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Loading utilities
8 | ###############################################################################
9 |
10 |
11 | def audio(file):
12 | """Load audio and maybe resample"""
13 | # Load
14 | audio, sample_rate = torchaudio.load(file)
15 |
16 | # Maybe resample
17 | return emphases.resample(audio, sample_rate)
18 |
--------------------------------------------------------------------------------
/emphases/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 | from .layers import Layers
3 |
4 | import emphases
5 |
--------------------------------------------------------------------------------
/emphases/model/core.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Model definition
8 | ###############################################################################
9 |
10 |
11 | class Model(torch.nn.Module):
12 |
13 | def __init__(self):
14 | super().__init__()
15 |
16 | # Input projection
17 | self.input_layer = torch.nn.Conv1d(
18 | emphases.NUM_FEATURES,
19 | emphases.CHANNELS,
20 | kernel_size=emphases.ENCODER_KERNEL_SIZE,
21 | padding='same')
22 |
23 | # Frame encoder
24 | self.frame_encoder = emphases.model.Layers(
25 | kernel_size=emphases.ENCODER_KERNEL_SIZE)
26 |
27 | # If we are resampling within the model, initialize word decoder
28 | if emphases.DOWNSAMPLE_LOCATION in ['input', 'intermediate']:
29 | self.word_decoder = emphases.model.Layers(
30 | kernel_size=emphases.DECODER_KERNEL_SIZE)
31 |
32 | # Output projection
33 | self.output_layer = torch.nn.Conv1d(
34 | emphases.CHANNELS,
35 | 1,
36 | kernel_size=emphases.DECODER_KERNEL_SIZE,
37 | padding='same')
38 |
39 | def forward(self, features, frame_lengths, word_bounds, word_lengths):
40 |
41 | if emphases.DOWNSAMPLE_LOCATION == 'input':
42 |
43 | # Segment acoustic features into word segments
44 | segments, bounds, lengths = emphases.segment(
45 | features,
46 | word_bounds,
47 | word_lengths)
48 |
49 | # Embed frames
50 | frame_embeddings = self.frame_encoder(
51 | self.input_layer(segments),
52 | lengths)
53 |
54 | # Downsample
55 | if emphases.DOWNSAMPLE_METHOD == 'average':
56 | word_embeddings = frame_embeddings.mean(dim=2, keepdim=True)
57 | elif emphases.DOWNSAMPLE_METHOD == 'max':
58 | word_embeddings = frame_embeddings.max(
59 | dim=2,
60 | keepdim=True
61 | ).values
62 | elif emphases.DOWNSAMPLE_METHOD == 'sum':
63 | word_embeddings = frame_embeddings.sum(dim=2, keepdim=True)
64 | elif emphases.DOWNSAMPLE_METHOD == 'center':
65 | word_embeddings = emphases.downsample(
66 | frame_embeddings,
67 | bounds,
68 | torch.ones(
69 | (len(lengths),),
70 | dtype=torch.long,
71 | device=lengths.device))
72 | else:
73 | raise ValueError(
74 |                     f'Downsample method {emphases.DOWNSAMPLE_METHOD} is not defined')
75 |
76 | # Stitch together word segment embeddings
77 | mask = mask_from_lengths(word_lengths)
78 | word_embeddings = word_embeddings.squeeze(2).transpose(0, 1).reshape(
79 | word_embeddings.shape[1],
80 | word_bounds.shape[0],
81 | word_bounds.shape[2]
82 | ).permute(1, 0, 2) * mask
83 |
84 | # Decode
85 | word_embeddings = self.word_decoder(
86 | word_embeddings,
87 | word_lengths)
88 |
89 | else:
90 |
91 | # Embed frames
92 | frame_embeddings = self.frame_encoder(
93 | self.input_layer(features),
94 | frame_lengths)
95 |
96 | if emphases.DOWNSAMPLE_LOCATION == 'intermediate':
97 |
98 | # Downsample activations to word resolution
99 | word_embeddings = emphases.downsample(
100 | frame_embeddings,
101 | word_bounds,
102 | word_lengths)
103 |
104 | # Infer emphasis scores from word embeddings
105 | word_embeddings = self.word_decoder(
106 | word_embeddings,
107 | word_lengths)
108 |
109 | elif emphases.DOWNSAMPLE_LOCATION == 'loss':
110 |
111 | # Downsample activations to word resolution
112 | word_embeddings = emphases.downsample(
113 | frame_embeddings,
114 | word_bounds,
115 | word_lengths)
116 |
117 | elif emphases.DOWNSAMPLE_LOCATION == 'inference':
118 |
119 | if self.training:
120 |
121 | # Return frame resolution prominence for framewise loss
122 | return self.output_layer(frame_embeddings)
123 |
124 | else:
125 |
126 | # Downsample activations to word resolution
127 | word_embeddings = emphases.downsample(
128 | frame_embeddings,
129 | word_bounds,
130 | word_lengths)
131 |
132 | else:
133 | raise ValueError(
134 | f'Downsample location {emphases.DOWNSAMPLE_LOCATION} ' +
135 | 'not recognized')
136 |
137 | # Project to scalar
138 | return self.output_layer(word_embeddings)
139 |
140 |
141 | ###############################################################################
142 | # Utilities
143 | ###############################################################################
144 |
145 |
146 | def mask_from_lengths(lengths):
147 | """Create boolean mask from sequence lengths"""
148 | x = torch.arange(lengths.max(), dtype=lengths.dtype, device=lengths.device)
149 | return (x.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(1)
150 |
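To make the masking convention concrete, a short sketch of mask_from_lengths; shapes follow the (batch, 1, time) layout used throughout the model:

    import torch

    import emphases

    lengths = torch.tensor([3, 1])
    mask = emphases.model.mask_from_lengths(lengths)
    # tensor([[[ True,  True,  True]],
    #         [[ True, False, False]]]) with shape (2, 1, 3)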
--------------------------------------------------------------------------------
/emphases/model/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from .convolution import Convolution
2 | from .transformer import Transformer
3 |
4 | import emphases
5 |
6 |
7 | def Layers(**kwargs):
8 | if emphases.ARCHITECTURE == 'convolution':
9 | return Convolution(**kwargs)
10 | elif emphases.ARCHITECTURE == 'transformer':
11 | return Transformer()
12 | else:
13 | raise ValueError(
14 | f'Network layer {emphases.ARCHITECTURE} is not defined')
15 |
--------------------------------------------------------------------------------
/emphases/model/layers/convolution.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
3 | import torch
4 |
5 | import emphases
6 |
7 |
8 | ###############################################################################
9 | # Convolution model
10 | ###############################################################################
11 |
12 |
13 | class Convolution(torch.nn.Sequential):
14 |
15 | def __init__(self, kernel_size=emphases.ENCODER_KERNEL_SIZE):
16 | # Bind common parameters
17 | conv_fn = functools.partial(
18 | torch.nn.Conv1d,
19 | kernel_size=kernel_size,
20 | padding='same')
21 |
22 | # Layers
23 | layers = []
24 | channels = emphases.CHANNELS
25 | for _ in range(emphases.LAYERS):
26 | layers.extend((
27 | conv_fn(channels, channels),
28 | emphases.ACTIVATION_FUNCTION()))
29 | if emphases.DROPOUT is not None:
30 | layers.append(torch.nn.Dropout(emphases.DROPOUT))
31 |
32 | # Register to Module
33 | super().__init__(*layers)
34 |
35 | # Ignore sequence length parameter needed for Transformer model
36 | def forward(self, x, _):
37 | return super().forward(x)
38 |
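For instance, a sketch of running the convolutional stack on a padded batch; the channel count comes from the configuration:

    import torch

    import emphases
    from emphases.model.layers import Convolution

    stack = Convolution(kernel_size=3)

    # (batch, CHANNELS, frames); the lengths argument is ignored by this variant
    x = torch.zeros(2, emphases.CHANNELS, 100)
    y = stack(x, None)  # same shape as x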
--------------------------------------------------------------------------------
/emphases/model/layers/transformer.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 |
5 | import emphases
6 |
7 |
8 | ###############################################################################
9 | # Transformer stack
10 | ###############################################################################
11 |
12 |
13 | class Transformer(torch.nn.Module):
14 |
15 | def __init__(self, num_layers=emphases.LAYERS, channels=emphases.CHANNELS):
16 | super().__init__()
17 | self.position = PositionalEncoding(channels, .1)
18 | self.model = torch.nn.TransformerEncoder(
19 | torch.nn.TransformerEncoderLayer(
20 | channels,
21 | 2,
22 | dim_feedforward=emphases.CHANNELS),
23 | num_layers)
24 |
25 | def forward(self, x, lengths):
26 | mask = emphases.model.mask_from_lengths(lengths)
27 | return self.model(
28 | self.position(x.permute(2, 0, 1)),
29 | src_key_padding_mask=~mask.squeeze(1)
30 | ).permute(1, 2, 0)
31 |
32 |
33 | ###############################################################################
34 | # Utilities
35 | ###############################################################################
36 |
37 |
38 | class PositionalEncoding(torch.nn.Module):
39 |
40 | def __init__(self, channels, dropout=.1, max_len=5000):
41 | super().__init__()
42 | self.dropout = torch.nn.Dropout(p=dropout)
43 | index = torch.arange(max_len).unsqueeze(1)
44 | frequency = torch.exp(
45 | torch.arange(0, channels, 2) * (-math.log(10000.0) / channels))
46 | encoding = torch.zeros(max_len, 1, channels)
47 | encoding[:, 0, 0::2] = torch.sin(index * frequency)
48 | encoding[:, 0, 1::2] = torch.cos(index * frequency)
49 | self.register_buffer('encoding', encoding)
50 |
51 | def forward(self, x):
52 | return self.dropout(x + self.encoding[:x.size(0)])
53 |
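A sketch of the positional encoding on its own; the input follows the (time, batch, channels) layout produced by the permute in Transformer.forward:

    import torch

    from emphases.model.layers.transformer import PositionalEncoding

    encoding = PositionalEncoding(channels=128, dropout=0.)
    x = torch.zeros(50, 2, 128)  # (time, batch, channels)
    y = encoding(x)              # same shape, with sinusoidal offsets added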
--------------------------------------------------------------------------------
/emphases/partition/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 |
--------------------------------------------------------------------------------
/emphases/partition/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import emphases
4 |
5 |
6 | def parse_args():
7 | """Parse command-line arguments"""
8 | parser = argparse.ArgumentParser(description='Partition datasets')
9 | parser.add_argument(
10 | '--datasets',
11 | nargs='+',
12 | default=emphases.DATASETS,
13 | help='The datasets to partition')
14 | return parser.parse_known_args()[0]
15 |
16 |
17 | if __name__ == '__main__':
18 | emphases.partition.datasets(**vars(parse_args()))
19 |
--------------------------------------------------------------------------------
/emphases/partition/core.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import emphases
4 |
5 |
6 | ###############################################################################
7 | # Partition dataset
8 | ###############################################################################
9 |
10 |
11 | def datasets(datasets):
12 | """Partition datasets"""
13 | for dataset in datasets:
14 |
15 |         # Get the partition file path
16 | file = emphases.PARTITION_DIR / f'{dataset}.json'
17 |
18 | # Random seed
19 | random.seed(emphases.RANDOM_SEED)
20 |
21 | # Make partition
22 | if dataset == 'automatic':
23 | partition = automatic()
24 | elif dataset == 'buckeye':
25 | partition = buckeye()
26 | elif dataset == 'libritts':
27 | partition = libritts()
28 | elif dataset == 'crowdsource':
29 | partition = crowdsource()
30 | else:
31 | raise ValueError(f'Dataset {dataset} is not defined')
32 |
33 | # Save to disk
34 | file.parent.mkdir(exist_ok=True, parents=True)
35 |         with open(file, 'w') as handle:
36 |             json.dump(partition, handle, ensure_ascii=False, indent=4)
37 |
38 |
39 | ###############################################################################
40 | # Existing datasets
41 | ###############################################################################
42 |
43 |
44 | def buckeye():
45 | """Partition buckeye dataset"""
46 | # Get audio files
47 | directory = emphases.CACHE_DIR / 'buckeye'
48 | audio_files = directory.rglob('*.wav')
49 |
50 | # Get stems
51 | stems = [file.stem for file in audio_files]
52 |
53 | # Partition
54 | return {'train': [], 'valid': [], 'test': stems}
55 |
56 |
57 | def libritts():
58 | """Partition libritts dataset"""
59 | # Get audio files
60 | directory = emphases.CACHE_DIR / 'libritts'
61 | audio_files = directory.rglob('*.wav')
62 |
63 | # Get stems
64 | stems = [file.stem for file in audio_files]
65 |
66 | # Shuffle stems
67 | random.seed(emphases.RANDOM_SEED)
68 | random.shuffle(stems)
69 |
70 | # Get split locations
71 | left = int(emphases.SPLIT_SIZE_TRAIN * len(stems))
72 | right = left + int(emphases.SPLIT_SIZE_VALID * len(stems))
73 |
74 | # Only train on specified eighth for scaling law experiments
75 | if emphases.ONE_EIGHTH_UTTERANCES:
76 |
77 | # Partition
78 | speakers = [str(s) for s in emphases.data.download.LIBRITTS_SPEAKERS]
79 | train = [stem for stem in stems if stem.split('_')[0] in speakers]
80 | valid = [stem for stem in stems[left:right] if stem not in train]
81 | test = [stem for stem in stems[right:] if stem not in train]
82 |
83 | else:
84 |
85 | # Partition
86 | train = stems[:left]
87 | valid = stems[left:right]
88 | test = stems[right:]
89 |
90 | # Maybe limit training set size
91 | if emphases.MAX_TRAINING_UTTERANCES is not None:
92 | train = train[:emphases.MAX_TRAINING_UTTERANCES]
93 |
94 | return {'train': train, 'valid': valid, 'test': test}
95 |
96 |
97 | ###############################################################################
98 | # Dataset creation
99 | ###############################################################################
100 |
101 |
102 | def automatic():
103 | """Partition dataset created from trained model"""
104 | # Get audio files
105 | directory = emphases.CACHE_DIR / 'automatic'
106 | audio_files = directory.rglob('*.wav')
107 |
108 | # Get stems
109 | stems = [file.stem for file in audio_files]
110 |
111 | # Shuffle stems
112 | random.seed(emphases.RANDOM_SEED)
113 | random.shuffle(stems)
114 |
115 | # Get split locations
116 | left = int(emphases.SPLIT_SIZE_TRAIN * len(stems))
117 | right = left + int(emphases.SPLIT_SIZE_VALID * len(stems))
118 |
119 | # Partition
120 | return {
121 | 'train': stems[:left],
122 | 'valid': stems[left:right],
123 | 'test': stems[right:]}
124 |
125 |
126 | def crowdsource():
127 | """Partition crowdsourced dataset"""
128 | # Get audio files
129 | directory = emphases.CACHE_DIR / 'crowdsource'
130 | audio_files = directory.rglob('*.wav')
131 |
132 | # Get stems
133 | stems = [file.stem for file in audio_files]
134 |
135 | # Shuffle stems
136 | random.seed(emphases.RANDOM_SEED)
137 | random.shuffle(stems)
138 |
139 | # Get split locations
140 | left = int(emphases.SPLIT_SIZE_TRAIN * len(stems))
141 | right = left + int(emphases.SPLIT_SIZE_VALID * len(stems))
142 |
143 | # Only train on specified eighth for scaling law experiments
144 | if emphases.ONE_EIGHTH_UTTERANCES:
145 |
146 | # Partition
147 | speakers = [str(s) for s in emphases.data.download.LIBRITTS_SPEAKERS]
148 | train = [stem for stem in stems if stem.split('_')[0] in speakers]
149 | valid = [stem for stem in stems[left:right] if stem not in train]
150 | test = [stem for stem in stems[right:] if stem not in train]
151 |
152 | else:
153 |
154 | # Partition
155 | train = stems[:left]
156 | valid = stems[left:right]
157 | test = stems[right:]
158 |
159 | # Maybe limit training set size
160 | if emphases.MAX_TRAINING_UTTERANCES is not None:
161 | train = train[:emphases.MAX_TRAINING_UTTERANCES]
162 |
163 | return {'train': train, 'valid': valid, 'test': test}
164 |
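# A hypothetical sketch of reading a partition written by datasets() above; the
# dataset name follows the f'{dataset}.json' convention used when saving.
import json

import emphases

with open(emphases.PARTITION_DIR / 'libritts.json') as file:
    partition = json.load(file)

# Each key maps to a list of audio file stems
train_stems, valid_stems, test_stems = (
    partition['train'], partition['valid'], partition['test'])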
--------------------------------------------------------------------------------
/emphases/plot/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 | from . import scaling
3 |
--------------------------------------------------------------------------------
/emphases/plot/core.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 |
3 | import torch
4 |
5 |
6 | ###############################################################################
7 | # Plot prominence
8 | ###############################################################################
9 |
10 |
11 | def scores(alignment, scores, targets=None):
12 | """Plot the aligned prominence scores"""
13 | figure, axis = plt.subplots(figsize=(30, 5))
14 | axis.set_axis_off()
15 | axis.set_ylim([0., 1.])
16 |
17 |     # Get word centers and durations
18 | centers = [word.start() + word.duration() / 2. for word in alignment]
19 | duration = [word.duration() for word in alignment]
20 |
21 | # Plot scores
22 | axis.bar(
23 | centers,
24 | scores,
25 | duration,
26 | edgecolor='black')
27 |
28 | # Plot words and dividers
29 | for word in alignment:
30 | axis.text(
31 | word.start() + word.duration() / 2,
32 | .015,
33 | str(word),
34 | fontsize=10,
35 | rotation=90,
36 | horizontalalignment='center')
37 | axis.axvline(
38 | word.start(),
39 | color='gray',
40 | linewidth=.5,
41 | ymin=0.,
42 | ymax=1.,
43 | clip_on=False,
44 | linestyle='--')
45 | axis.axvline(
46 | alignment.duration(),
47 | color='gray',
48 | linewidth=.5,
49 | ymin=0.,
50 | ymax=1.,
51 | clip_on=False,
52 | linestyle='--')
53 |
54 | if targets is not None:
55 |
56 | # Plot targets
57 | axis.bar(centers, targets, duration)
58 |
59 | # Plot overlap
60 | overlap = torch.minimum(scores, targets)
61 | axis.bar(centers, overlap, duration, color='gray')
62 |
63 | return figure
64 |
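# A hypothetical usage sketch for the plotting function above; the alignment
# path and the uniform scores are stand-ins, and it assumes pypar can load an
# alignment directly from a file.
import pypar
import torch

import emphases

alignment = pypar.Alignment('utterance.TextGrid')

# One score per word, here a uniform placeholder value
scores = torch.full((sum(1 for _ in alignment),), .5)

# Render and save the figure
figure = emphases.plot.scores(alignment, scores)
figure.savefig('scores.jpg', bbox_inches='tight')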
--------------------------------------------------------------------------------
/emphases/plot/scaling/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
--------------------------------------------------------------------------------
/emphases/plot/scaling/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from pathlib import Path
3 |
4 | import emphases
5 |
6 |
7 | ###############################################################################
8 | # Scaling laws plot
9 | ###############################################################################
10 |
11 |
12 | def parse_args():
13 | """Parse command-line arguments"""
14 | parser = argparse.ArgumentParser(
15 | description='Create scaling law figure')
16 | parser.add_argument(
17 | '--evaluations',
18 | type=str,
19 | nargs='+',
20 | required=True,
21 | help='The evaluations to plot')
22 | parser.add_argument(
23 | '--xlabel',
24 | type=str,
25 | required=True,
26 | help='Label for x axis')
27 | parser.add_argument(
28 | '--output_file',
29 | type=Path,
30 | required=True,
31 |         help='The output figure file')
32 | parser.add_argument(
33 | '--yticks',
34 | type=float,
35 | nargs='+',
36 | required=True,
37 | help='The y axis tick mark locations')
38 | parser.add_argument(
39 | '--sizes',
40 | type=int,
41 | nargs='+',
42 | help='The number of utterances used in each evaluation')
43 | parser.add_argument(
44 | '--scores',
45 | type=float,
46 | nargs='+',
47 | help='The Pearson Correlation y values')
48 | parser.add_argument(
49 | '--steps',
50 | type=int,
51 | nargs='+',
52 | help='The number of training steps')
53 | parser.add_argument(
54 | '--text_offsets',
55 | type=float,
56 | nargs='+',
57 | help='The amount to space the text below the plot point')
58 | return parser.parse_args()
59 |
60 |
61 | if __name__ == '__main__':
62 | emphases.plot.scaling.scaling_laws(**vars(parse_args()))
63 |
--------------------------------------------------------------------------------
/emphases/plot/scaling/core.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | import matplotlib.pyplot as plt
3 | import torch
4 |
5 | import emphases
6 |
7 |
8 | ###############################################################################
9 | # Plot scaling laws
10 | ###############################################################################
11 |
12 |
13 | def scaling_laws(
14 | evaluations,
15 | xlabel,
16 | output_file,
17 | yticks,
18 | scores=None,
19 | steps=None,
20 | sizes=None,
21 | text_offsets=None):
22 | """Plot scaling laws"""
23 | # Load evaluation results
24 | if scores is None or steps is None:
25 | scores, steps = [], []
26 | for evaluation in evaluations:
27 | path, score = emphases.checkpoint.best_path(
28 | emphases.RUNS_DIR / evaluation)
29 | checkpoint = torch.load(path, map_location='cpu')
30 | scores.append(score)
31 | steps.append(checkpoint['step'])
32 |
33 | # Get x values
34 |     x = [int(evaluation.split('-')[-1]) for evaluation in evaluations]
35 |
36 | # Create plot
37 | figure, axis = plt.subplots(figsize=(8, 2))
38 |
39 | # Remove frame
40 | axis.spines['top'].set_visible(False)
41 | axis.spines['right'].set_visible(False)
42 | axis.spines['bottom'].set_visible(False)
43 | axis.spines['left'].set_visible(False)
44 |
45 | # Format x axis
46 | x_range = max(x) - min(x)
47 | axis.set_xlim([0, max(x) + 0.1 * x_range])
48 | axis.get_xaxis().set_ticks(x)
49 | axis.set_xlabel(xlabel)
50 | axis.xaxis.set_ticks(x)
51 | axis.xaxis.set_ticklabels(x)
52 |
53 | # Format y axis
54 | axis.get_yaxis().set_ticks(yticks)
55 | axis.set_ylim([min(yticks) - .002, max(yticks) + .002])
56 |     axis.tick_params(axis='both', which='both', length=0)
57 | axis.set_ylabel('Pearson correlation')
58 |
59 | # Grid lines
60 | for tick in yticks:
61 | axis.axhline(tick, color='gray', linestyle='--', linewidth=.8)
62 |
63 | # Plot
64 | colors = ['blue', 'orange', 'purple', 'red']
65 | for i in range(len(x)):
66 |         axis.scatter(x[i], scores[i], color=colors[i % len(colors)])
67 |
68 | # Default text offset
69 | if text_offsets is None:
70 | text_offsets = [0.011] * len(evaluations)
71 |
72 | # Annotate
73 | for i in range(len(evaluations)):
74 | text = f'steps={steps[i]}'
75 | if sizes is not None:
76 | text += f'\nutterances={sizes[i]}'
77 | axis.text(
78 | x[i],
79 | scores[i] - text_offsets[i],
80 | text,
81 | horizontalalignment='center')
82 |
83 | # Save
84 | figure.savefig(output_file, bbox_inches='tight', pad_inches=0, dpi=300)
85 |
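# The commented-out dataset-size invocation in run.sh can also be reproduced
# programmatically; the values below are copied from that script and the output
# path is a stand-in. Passing scores and steps skips the checkpoint loading.
import emphases

emphases.plot.scaling.scaling_laws(
    evaluations=['400', '800', '1600', '3200'],
    xlabel='Utterances',
    output_file='results/scaling-data.pdf',
    yticks=[0.63, 0.65, 0.67, 0.69],
    scores=[0.633, 0.657, 0.678, 0.687],
    steps=[400, 500, 767, 1433],
    text_offsets=[0.007, 0.007, 0.007, 0.007])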
--------------------------------------------------------------------------------
/emphases/train/__init__.py:
--------------------------------------------------------------------------------
1 | from .core import *
2 |
--------------------------------------------------------------------------------
/emphases/train/__main__.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import shutil
3 | from pathlib import Path
4 |
5 | import torchutil
6 |
7 | import emphases
8 |
9 |
10 | ###############################################################################
11 | # Entry point
12 | ###############################################################################
13 |
14 |
15 | def main(config, dataset, gpu=None):
16 | # Create output directory
17 | directory = emphases.RUNS_DIR / config.stem
18 | directory.mkdir(parents=True, exist_ok=True)
19 |
20 | # Save configuration
21 | shutil.copyfile(config, directory / config.name)
22 |
23 | # Train
24 | emphases.train(dataset, directory, gpu)
25 |
26 | # Get best checkpoint
27 | checkpoint = torchutil.checkpoint.best_path(directory)[0]
28 |
29 | # Evaluate
30 | emphases.evaluate.datasets(emphases.EVALUATION_DATASETS, checkpoint, gpu)
31 |
32 |
33 | def parse_args():
34 | """Parse command-line arguments"""
35 | parser = argparse.ArgumentParser(description='Train a model')
36 | parser.add_argument(
37 | '--config',
38 | type=Path,
39 | help='The configuration file')
40 | parser.add_argument(
41 | '--dataset',
42 | default=emphases.TRAINING_DATASET,
43 | help='The dataset to train on')
44 | parser.add_argument(
45 | '--gpu',
46 | type=int,
47 | help='The gpu to run training on')
48 | return parser.parse_args()
49 |
50 |
51 | if __name__ == '__main__':
52 | main(**vars(parse_args()))
53 |
--------------------------------------------------------------------------------
/emphases/train/core.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torchutil
3 |
4 | import emphases
5 |
6 |
7 | ###############################################################################
8 | # Training
9 | ###############################################################################
10 |
11 |
12 | @torchutil.notify('train')
13 | def train(dataset, directory, gpu=None):
14 | """Train a model"""
15 |
16 | # Get torch device
17 | device = torch.device('cpu' if gpu is None else f'cuda:{gpu}')
18 |
19 | #######################
20 | # Create data loaders #
21 | #######################
22 |
23 | torch.manual_seed(emphases.RANDOM_SEED)
24 |
25 | # Training data
26 | train_loader = emphases.data.loader(dataset, 'train', gpu)
27 |
28 | # Validation data
29 | if emphases.VALIDATION_DATASET == 'buckeye':
30 |
31 | # This is just for generating scaling law plots for the paper
32 | valid_loader = emphases.data.loader('buckeye', 'test', gpu)
33 |
34 | else:
35 |
36 | valid_loader = emphases.data.loader(dataset, 'valid', gpu)
37 |
38 | ################
39 | # Create model #
40 | ################
41 |
42 | model = emphases.Model().to(device)
43 |
44 | ####################
45 | # Create optimizer #
46 | ####################
47 |
48 | optimizer = torch.optim.Adam(model.parameters())
49 |
50 | ##############################
51 | # Maybe load from checkpoint #
52 | ##############################
53 |
54 | path = torchutil.checkpoint.latest_path(directory)
55 |
56 | if path is not None:
57 |
58 | # Load model
59 | model, optimizer, state = torchutil.checkpoint.load(
60 | path,
61 | model,
62 | optimizer)
63 | epoch = state['epoch']
64 | step = state['step']
65 | score = state['score']
66 | best = state['best']
67 |
68 | else:
69 |
70 | # Train from scratch
71 | epoch, step, score, best = 0, 0, 0., 0.
72 |
73 | #########
74 | # Train #
75 | #########
76 |
77 | # Automatic mixed precision (amp) gradient scaler
78 | scaler = torch.cuda.amp.GradScaler()
79 |
80 | # Setup progress bar
81 | progress = torchutil.iterator(
82 | range(step, emphases.NUM_STEPS),
83 | f'Training {emphases.CONFIG}',
84 | step,
85 | emphases.NUM_STEPS)
86 | while step < emphases.NUM_STEPS:
87 |
88 | # Seed sampler
89 | train_loader.batch_sampler.set_epoch(epoch)
90 |
91 | for batch in train_loader:
92 |
93 | # Unpack batch
94 | (
95 | features,
96 | frame_lengths,
97 | word_bounds,
98 | word_lengths,
99 | targets,
100 | _, # alignment
101 | _, # audio
102 | _ # stem
103 | ) = batch
104 |
105 | # Copy to GPU
106 | features = features.to(device)
107 | frame_lengths = frame_lengths.to(device)
108 | word_bounds = word_bounds.to(device)
109 | word_lengths = word_lengths.to(device)
110 | targets = targets.to(device)
111 | with torch.autocast(device.type):
112 |
113 | # Forward pass
114 | scores = model(
115 | features,
116 | frame_lengths,
117 | word_bounds,
118 | word_lengths)
119 |
120 | # Compute loss
121 | train_loss = loss(
122 | scores,
123 | targets,
124 | frame_lengths,
125 | word_bounds,
126 | word_lengths,
127 | training=True)
128 |
129 | ##################
130 | # Optimize model #
131 | ##################
132 |
133 | optimizer.zero_grad()
134 |
135 | # Backward pass
136 | scaler.scale(train_loss).backward()
137 |
138 | # Update weights
139 | scaler.step(optimizer)
140 |
141 | # Update gradient scaler
142 | scaler.update()
143 |
144 | ############
145 | # Evaluate #
146 | ############
147 |
148 | if step % emphases.LOG_INTERVAL == 0:
149 | score = evaluate(
150 | directory,
151 | step,
152 | model,
153 | gpu,
154 | 'valid',
155 | valid_loader)
156 |
157 | ###################
158 | # Save checkpoint #
159 | ###################
160 |
161 | if step >= 300 and score > best:
162 | torchutil.checkpoint.save(
163 | directory / f'{step:08d}.pt',
164 | model,
165 | optimizer,
166 | epoch=epoch,
167 | step=step,
168 | score=score,
169 | best=best)
170 | best = score
171 |
172 | # End training after a certain number of steps
173 | if step >= emphases.NUM_STEPS:
174 | break
175 |
176 | # Update training step count
177 | step += 1
178 |
179 | # Update progress bar
180 | progress.update()
181 |
182 | # Update epoch count
183 | epoch += 1
184 |
185 | # Close progress bar
186 | progress.close()
187 |
188 | # Save final model
189 | torchutil.checkpoint.save(
190 | directory / f'{step:08d}.pt',
191 | model,
192 | optimizer,
193 | epoch=epoch,
194 | step=step,
195 | score=score,
196 | best=best)
197 |
198 |
199 | ###############################################################################
200 | # Evaluation
201 | ###############################################################################
202 |
203 |
204 | def evaluate(directory, step, model, gpu, condition, loader):
205 | """Perform model evaluation"""
206 | device = 'cpu' if gpu is None else f'cuda:{gpu}'
207 |
208 | # Tensorboard audio and figures
209 | waveforms, figures = {}, {}
210 |
211 | # Prepare model for inference
212 | with emphases.inference_context(model):
213 |
214 | # Cache results to evaluate
215 | results = []
216 | for i, batch in enumerate(loader):
217 |
218 | # Unpack batch
219 | (
220 | features,
221 | frame_lengths,
222 | word_bounds,
223 | word_lengths,
224 | targets,
225 | alignments,
226 | audio,
227 | stems
228 | ) = batch
229 |
230 | # Copy to GPU
231 | features = features.to(device)
232 | frame_lengths = frame_lengths.to(device)
233 | word_bounds = word_bounds.to(device)
234 | word_lengths = word_lengths.to(device)
235 | targets = targets.to(device)
236 |
237 | # Forward pass
238 | logits = model(
239 | features,
240 | frame_lengths,
241 | word_bounds,
242 | word_lengths)
243 |
244 | # Cache results
245 | results.append((
246 | logits.detach().cpu(),
247 | targets.detach().cpu(),
248 | word_lengths.detach().cpu()))
249 |
250 | # Add audio and figures
251 | if condition == 'valid' and i < emphases.PLOT_EXAMPLES:
252 |
253 | # Postprocess network output
254 | scores = emphases.postprocess(logits)
255 |
256 | # Add audio
257 | samples = emphases.convert.frames_to_samples(frame_lengths[0])
258 | waveforms[f'audio/{stems[0]}'] = audio[0, :, :samples]
259 |
260 | # Add figure
261 | figures[stems[0]] = emphases.plot.scores(
262 | alignments[0],
263 | scores[0, 0, :word_lengths[0]].cpu(),
264 | targets[0, 0, :word_lengths[0]].cpu())
265 |
266 | # Stop when we exceed some number of batches
267 | if i + 1 == emphases.LOG_STEPS:
268 | break
269 |
270 | # Setup batch statistics
271 | target_stats = emphases.evaluate.metrics.Statistics()
272 | predicted_stats = emphases.evaluate.metrics.Statistics()
273 |
274 | # Update statistics
275 | for logits, targets, word_lengths in results:
276 | target_stats.update(
277 | targets.to(device),
278 | word_lengths.to(device))
279 | predicted_stats.update(
280 | emphases.postprocess(logits.to(device)),
281 | word_lengths.to(device))
282 |
283 | # Setup evaluation metrics
284 | metrics = emphases.evaluate.Metrics(predicted_stats, target_stats)
285 |
286 | # Update metrics
287 | for logits, targets, word_lengths in results:
288 | metrics.update(
289 | logits.to(device),
290 | targets.to(device),
291 | word_lengths.to(device))
292 |
293 | # Format results
294 | scalars = {
295 | f'{key}/{condition}': value for key, value in metrics().items()}
296 |
297 | # Write to tensorboard
298 | torchutil.tensorboard.update(
299 | directory,
300 | step,
301 | scalars=scalars,
302 | figures=figures,
303 | audio=waveforms,
304 | sample_rate=emphases.SAMPLE_RATE)
305 |
306 | # Return Pearson correlation
307 | return scalars[f'pearson_correlation/{condition}']
308 |
309 |
310 | ###############################################################################
311 | # Loss function
312 | ###############################################################################
313 |
314 |
315 | def loss(
316 | scores,
317 | targets,
318 | frame_lengths,
319 | word_bounds,
320 | word_lengths,
321 | training=False,
322 | loss_fn=emphases.LOSS):
323 | """Compute masked loss"""
324 | if training and emphases.DOWNSAMPLE_LOCATION == 'inference':
325 |
326 | # If we are not downsampling the network output before the loss, we
327 | # must upsample the targets
328 | targets = emphases.upsample(
329 | targets,
330 | word_bounds,
331 | word_lengths,
332 | frame_lengths)
333 |
334 | # Linear interpolation can cause out-of-range
335 | if emphases.UPSAMPLE_METHOD == 'linear':
336 | targets = torch.clamp(targets, min=0., max=1.)
337 |
338 | # Frame resolution sequence mask
339 | mask = emphases.model.mask_from_lengths(frame_lengths)
340 |
341 | else:
342 |
343 | # Word resolution sequence mask
344 | mask = emphases.model.mask_from_lengths(word_lengths)
345 |
346 | # Compute masked loss
347 | if loss_fn == 'bce':
348 | return torch.nn.functional.binary_cross_entropy_with_logits(
349 | scores[mask],
350 | targets[mask])
351 | elif loss_fn == 'mse':
352 | return torch.nn.functional.mse_loss(scores[mask], targets[mask])
353 | raise ValueError(f'Loss {loss_fn} is not recognized')
354 |
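# A minimal, hypothetical sketch of the masked loss on dummy word-level tensors;
# it assumes emphases.model.mask_from_lengths returns a boolean mask matching
# the (batch, 1, max_length) score layout, as its use above implies.
import torch

import emphases

scores = torch.randn(2, 1, 5)        # network logits
targets = torch.rand(2, 1, 5)        # emphasis annotations in [0, 1]
word_lengths = torch.tensor([5, 3])  # the second item has two padded positions

mask = emphases.model.mask_from_lengths(word_lengths)
loss_value = torch.nn.functional.binary_cross_entropy_with_logits(
    scores[mask],
    targets[mask])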
--------------------------------------------------------------------------------
/eval/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/eval/.gitkeep
--------------------------------------------------------------------------------
/notebooks/select-speakers.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "f473bbab-e880-4f10-be02-f2abf38ca9ad",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "%load_ext autoreload\n",
11 | "%autoreload 2"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "id": "6e996f74-4c77-469a-a333-062febcaa78b",
18 | "metadata": {
19 | "tags": []
20 | },
21 | "outputs": [],
22 | "source": [
23 | "import random\n",
24 | "\n",
25 | "import IPython.display as ipd\n",
26 | "import torchaudio\n",
27 | "\n",
28 | "import emphases"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": null,
34 | "id": "9004ca30-9fa2-436b-ad2c-b778b895e6f6",
35 | "metadata": {
36 | "tags": []
37 | },
38 | "outputs": [],
39 | "source": [
40 | "dataset = 'libritts'\n",
41 | "directory = emphases.CACHE_DIR / dataset\n",
42 | "files = list(directory.rglob('*.wav'))"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": null,
48 | "id": "7c23645c-95c3-45f7-8945-40ca6ff0c064",
49 | "metadata": {
50 | "tags": []
51 | },
52 | "outputs": [],
53 | "source": [
54 | "speakers = sorted(list(set(file.stem.split('_')[0] for file in files)))\n",
55 | "speaker_sizes = {speaker: 0. for speaker in speakers}\n",
56 | "for file in files:\n",
57 | " info = torchaudio.info(file)\n",
58 | " speaker_sizes[file.stem.split('_')[0]] += info.num_frames / info.sample_rate\n",
59 | "total = sum(speaker_sizes.values())\n",
60 | "total"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": null,
66 | "id": "fe70a29b-d5fc-4a32-b08f-21c67776b420",
67 | "metadata": {
68 | "tags": []
69 | },
70 | "outputs": [],
71 | "source": [
72 | "current = [\n",
73 | " # Top 5 Female\n",
74 | " 40,\n",
75 | " 669,\n",
76 | " 4362,\n",
77 | " 5022,\n",
78 | " 8123,\n",
79 | " \n",
80 | " # Additional female speakers to get to 1/8th \n",
81 | " 5022,\n",
82 | " 696,\n",
83 | " 6272,\n",
84 | " 5163,\n",
85 | "\n",
86 | " # Top 5 Male\n",
87 | " 196,\n",
88 | " 460,\n",
89 | " 1355,\n",
90 | " 3664,\n",
91 | " 7067, # uses character voices\n",
92 | " \n",
93 | " # Additional male speakers to get to 1/8th \n",
94 | " 405,\n",
95 | " 6437,\n",
96 | " 446, # uses character voices\n",
97 | " 4397\n",
98 | "]\n",
99 | "current_total = sum(speaker_sizes[str(speaker)] for speaker in current) \n",
100 | "current_total"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "id": "9abbbc3f-5d18-48d2-ae57-da36fa322da9",
107 | "metadata": {
108 | "tags": []
109 | },
110 | "outputs": [],
111 | "source": [
112 | "current_total / total / (1/8)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "id": "944611a7-7a50-4b86-b86d-e79a83d91f8d",
119 | "metadata": {
120 | "tags": []
121 | },
122 | "outputs": [],
123 | "source": [
124 | "speaker = 4397\n",
125 | "files = [file for file in (directory / 'audio').rglob('*.wav') if file.stem.startswith(f'{speaker}_')]\n",
126 | "ipd.display(ipd.Audio(random.choice(files)))"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": null,
132 | "id": "f1c42bc9-95bc-4ceb-8a88-dc6b60867bb1",
133 | "metadata": {
134 | "tags": []
135 | },
136 | "outputs": [],
137 | "source": [
138 | "candidates = sorted(speaker_sizes.items(), key=lambda item: item[1], reverse=True)\n",
139 | "candidates"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "id": "2e7925d3-6c35-459e-af2c-f9af74c21bf7",
146 | "metadata": {},
147 | "outputs": [],
148 | "source": []
149 | }
150 | ],
151 | "metadata": {
152 | "kernelspec": {
153 | "display_name": "env",
154 | "language": "python",
155 | "name": "env"
156 | },
157 | "language_info": {
158 | "codemirror_mode": {
159 | "name": "ipython",
160 | "version": 3
161 | },
162 | "file_extension": ".py",
163 | "mimetype": "text/x-python",
164 | "name": "python",
165 | "nbconvert_exporter": "python",
166 | "pygments_lexer": "ipython3",
167 | "version": "3.9.16"
168 | }
169 | },
170 | "nbformat": 4,
171 | "nbformat_minor": 5
172 | }
173 |
--------------------------------------------------------------------------------
/results/scaling-annotators.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/results/scaling-annotators.pdf
--------------------------------------------------------------------------------
/results/scaling-data.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/results/scaling-data.pdf
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | # Runs experiments in the paper
2 | # "Crowdsourced and Automatic Speech Prominence Estimation"
3 |
4 | # Args
5 | # $1 - the GPU index
6 |
7 | SCRIPTDIR="$( dirname -- "$0"; )"
8 |
9 | ####################################
10 | # Annotator redundancy experiments #
11 | ####################################
12 |
13 |
14 | # N.B. - These experiments require Buckeye for evaluation and are therefore
15 | # commented out (see note in README).
16 |
17 | # # 1/64; 8 annotations
18 | # rm -rf data/cache/crowdsource/*
19 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/64-8.py
20 | # python -m emphases.data.preprocess --gpu $1
21 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/64-8.py
22 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/64-8.py --gpu $1
23 |
24 | # # 1/32; 4 annotations
25 | # rm -rf data/cache/crowdsource/*
26 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/32-4.py
27 | # python -m emphases.data.preprocess --gpu $1
28 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/32-4.py
29 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/32-4.py --gpu $1
30 |
31 | # # 1/16; 2 annotations
32 | # rm -rf data/cache/crowdsource/*
33 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/16-2.py
34 | # python -m emphases.data.preprocess --gpu $1
35 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/16-2.py
36 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/16-2.py --gpu $1
37 |
38 | # # 1/8; 1 annotation
39 | # rm -rf data/cache/crowdsource/*
40 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/8-1.py
41 | # python -m emphases.data.preprocess --gpu $1
42 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/8-1.py
43 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/8-1.py --gpu $1
44 |
45 | # # Plot results
46 | # python -m emphases.plot.scaling \
47 | # --evaluations 8-1 16-2 32-4 64-8 \
48 | # --xlabel "Annotators per utterance" \
49 | # --output_file results/scaling-annotators.pdf \
50 | # --sizes 3200 1600 800 400 \
51 | # --scores 0.686 0.683 0.667 0.664 \
52 | # --steps 967 933 567 467 \
53 | # --yticks 0.66 0.67 0.68 0.69 \
54 | # --text_offsets 0.007 0.01 0.007 0.007
55 |
56 |
57 | # ####################################
58 | # # Dataset size scaling experiments #
59 | # ####################################
60 |
61 |
62 | # N.B. - These experiments require Buckeye for evaluation and are therefore
63 | # commented out (see note in README).
64 |
65 | # # 400 utterances
66 | # rm -rf data/cache/crowdsource/*
67 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/400.py
68 | # python -m emphases.data.preprocess --gpu $1
69 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/400.py
70 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/400.py --gpu $1
71 |
72 | # # 800 utterances
73 | # rm -rf data/cache/crowdsource/*
74 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/800.py
75 | # python -m emphases.data.preprocess --gpu $1
76 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/800.py
77 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/800.py --gpu $1
78 |
79 | # # 1600 utterances
80 | # rm -rf data/cache/crowdsource/*
81 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/1600.py
82 | # python -m emphases.data.preprocess --gpu $1
83 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/1600.py
84 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/1600.py --gpu $1
85 |
86 | # # 3200 utterances
87 | # rm -rf data/cache/crowdsource/*
88 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/3200.py
89 | # python -m emphases.data.preprocess --gpu $1
90 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/3200.py
91 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/3200.py --gpu $1
92 |
93 | # # Plot results
94 | # python -m emphases.plot.scaling \
95 | # --evaluations 400 800 1600 3200 \
96 | # --xlabel Utterances \
97 | # --output_file results/scaling-data.pdf \
98 | # --yticks 0.63 0.65 0.67 0.69 \
99 | # --scores 0.633 0.657 0.678 0.687 \
100 | # --steps 400 500 767 1433 \
101 | # --text_offsets 0.007 0.007 0.007 0.007
102 |
103 |
104 | ##############
105 | # Best model #
106 | ##############
107 |
108 |
109 | python -m emphases.data.download
110 | python -m emphases.data.preprocess --gpu $1
111 | python -m emphases.partition
112 | python -m emphases.train --config $SCRIPTDIR/config/base.py --gpu $1
113 |
114 |
115 | #############
116 | # Ablations #
117 | #############
118 |
119 |
120 | python -m emphases.train --config $SCRIPTDIR/config/hparam-search/mse.py --gpu $1
121 |
122 |
123 | ##############
124 | # Downsample #
125 | ##############
126 |
127 |
128 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-inference.py --gpu $1
129 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-intermediate.py --gpu $1
130 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-input.py --gpu $1
131 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-loss.py --gpu $1
132 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-inference.py --gpu $1
133 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-intermediate.py --gpu $1
134 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-input.py --gpu $1
135 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-loss.py --gpu $1
136 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-inference.py --gpu $1
137 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-intermediate.py --gpu $1
138 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-input.py --gpu $1
139 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-loss.py --gpu $1
140 |
141 |
142 | ####################################
143 | # Large-scale automatic annotation #
144 | ####################################
145 |
146 |
147 | python -m emphases.data.download --datasets automatic --gpu $1
148 | python -m emphases.partition --datasets automatic
149 | python -m emphases.data.preprocess --datasets automatic --gpu $1
150 | python -m emphases.train --config $SCRIPTDIR/config/scaling/base-automatic.py --dataset automatic --gpu $1
151 |
152 |
153 | #############
154 | # Baselines #
155 | #############
156 |
157 |
158 | python -m emphases.evaluate --config $SCRIPTDIR/config/baselines/prominence.py
159 |
--------------------------------------------------------------------------------
/runs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/runs/.gitkeep
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 |
4 | with open('README.md', encoding='utf8') as file:
5 | long_description = file.read()
6 |
7 |
8 | setup(
9 | name='emphases',
10 | description='Crowdsourced and Automatic Speech Prominence Estimation',
11 | version='0.0.2',
12 | author='Interactive Audio Lab',
13 | author_email='interactiveaudiolab@gmail.com',
14 | url='https://github.com/interactiveaudiolab/emphases',
15 | install_requires=[
16 | 'GPUtil',
17 | 'huggingface-hub',
18 | 'librosa',
19 | 'matplotlib',
20 | 'numpy',
21 | 'penn',
22 | 'pycwt',
23 | 'pyfoal',
24 | 'pypar',
25 | 'pyyaml',
26 | 'reseval',
27 | 'scipy',
28 | 'torch',
29 | 'torchutil',
30 | 'torchaudio',
31 | 'yapecs'],
32 | packages=find_packages(),
33 | package_data={'emphases': ['assets/*', 'assets/*/*']},
34 | long_description=long_description,
35 | long_description_content_type='text/markdown',
36 |     keywords=['annotation', 'audio', 'emphasis', 'prominence', 'speech'],
37 | classifiers=['License :: OSI Approved :: MIT License'],
38 | license='MIT')
39 |
--------------------------------------------------------------------------------