├── .gitignore ├── LICENSE ├── README.md ├── config ├── base.py ├── baselines │ ├── duration-variance.py │ ├── pitch-variance.py │ └── prominence.py ├── downsample │ ├── average-inference.py │ ├── average-input.py │ ├── average-intermediate.py │ ├── average-loss.py │ ├── center-inference.py │ ├── center-input.py │ ├── center-intermediate.py │ ├── center-loss.py │ ├── max-inference.py │ ├── max-input.py │ ├── max-intermediate.py │ ├── max-loss.py │ ├── sum-inference.py │ ├── sum-input.py │ ├── sum-intermediate.py │ └── sum-loss.py ├── hparam-search │ ├── batch-060000.py │ ├── batch-070000.py │ ├── batch-075000.py │ ├── batch-100000.py │ ├── buckets-1.py │ ├── buckets-2.py │ ├── convolution-5-80.py │ ├── convolution-6-128.py │ ├── convolution-6-64.py │ ├── convolution-6-80.py │ ├── convolution-7-80.py │ ├── decoder-kernel-1.py │ ├── decoder-kernel-5.py │ ├── dropout-05.py │ ├── dropout-10.py │ ├── encoder-kernel-5.py │ ├── encoder-kernel-7.py │ ├── gelu.py │ ├── leaky-relu.py │ ├── mse.py │ └── silu.py └── scaling │ ├── 16-2.py │ ├── 1600.py │ ├── 32-4.py │ ├── 3200.py │ ├── 400.py │ ├── 64-8.py │ ├── 8-1.py │ ├── 800.py │ └── base-automatic.py ├── data ├── cache │ └── .gitkeep ├── datasets │ └── .gitkeep └── sources │ └── .gitkeep ├── emphases ├── __init__.py ├── __main__.py ├── annotate │ ├── __init__.py │ ├── __main__.py │ └── core.py ├── assets │ ├── checkpoints │ │ ├── .gitkeep │ │ └── checkpoint.pt │ ├── configs │ │ └── annotate.yaml │ └── partitions │ │ ├── .gitkeep │ │ ├── automatic.json │ │ ├── buckeye.json │ │ ├── crowdsource.json │ │ └── libritts.json ├── baselines │ ├── __init__.py │ ├── duration_variance │ │ ├── __init__.py │ │ └── core.py │ ├── pitch_variance │ │ ├── __init__.py │ │ └── core.py │ └── prominence │ │ ├── __init__.py │ │ ├── core.py │ │ ├── cwt_utils.py │ │ ├── duration_processing.py │ │ ├── energy_processing.py │ │ ├── f0_processing.py │ │ ├── filter.py │ │ ├── loma.py │ │ ├── pitch_tracker.py │ │ └── smooth_and_interp.py ├── config │ ├── __init__.py │ ├── defaults.py │ └── static.py ├── convert.py ├── core.py ├── data │ ├── __init__.py │ ├── collate.py │ ├── dataset.py │ ├── download │ │ ├── __init__.py │ │ ├── __main__.py │ │ └── core.py │ ├── loader.py │ ├── preprocess │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── core.py │ │ ├── loudness.py │ │ └── mels.py │ └── sampler.py ├── evaluate │ ├── __init__.py │ ├── __main__.py │ ├── core.py │ └── metrics.py ├── load.py ├── model │ ├── __init__.py │ ├── core.py │ └── layers │ │ ├── __init__.py │ │ ├── convolution.py │ │ └── transformer.py ├── partition │ ├── __init__.py │ ├── __main__.py │ └── core.py ├── plot │ ├── __init__.py │ ├── core.py │ └── scaling │ │ ├── __init__.py │ │ ├── __main__.py │ │ └── core.py └── train │ ├── __init__.py │ ├── __main__.py │ └── core.py ├── eval └── .gitkeep ├── notebooks ├── analyze-annotations.ipynb └── select-speakers.ipynb ├── results ├── scaling-annotators.pdf └── scaling-data.pdf ├── run.sh ├── runs └── .gitkeep └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/cache/* 2 | !data/cache/.gitkeep 3 | data/datasets/* 4 | !data/datasets/.gitkeep 5 | data/sources/* 6 | !data/sources/.gitkeep 7 | eval/* 8 | !eval/.gitkeep 9 | runs/* 10 | !runs/.gitkeep 11 | htk/ 12 | config/hyperparam-search/* 13 | utils/* 14 | 15 | __pycache__/ 16 | .DS_Store 17 | ._.DS_Store 18 | .ipynb_checkpoints/ 19 | .vscode/ 20 | *.egg-info/ 21 | */.ipynb_checkpoints/* 22 | dist/ 23 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Interactive Audio Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# Crowdsourced and Automatic Speech Prominence Estimation 2 | 
3 | 4 | [![PyPI](https://img.shields.io/pypi/v/emphases.svg)](https://pypi.python.org/pypi/emphases) 5 | [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) 6 | [![Downloads](https://static.pepy.tech/badge/emphases)](https://pepy.tech/project/emphases) 7 | 8 | Annotation, training, evaluation and inference of speech prominence 9 | 10 | [Paper](https://www.maxrmorrison.com/pdfs/morrison2024crowdsourced.pdf) [Website](https://www.maxrmorrison.com/sites/prominence-estimation) [Dataset](https://zenodo.org/records/10402793) 11 | 12 |
13 | 14 | 15 | ## Table of contents 16 | 17 | - [Installation](#installation) 18 | - [Inference](#inference) 19 | * [Application programming interface](#application-programming-interface) 20 | * [`emphases.from_alignment_and_audio`](#emphasesfrom_alignment_and_audio) 21 | * [`emphases.from_text_and_audio`](#emphasesfrom_text_and_audio) 22 | * [`emphases.from_file`](#emphasesfrom_file) 23 | * [`emphases.from_file_to_file`](#emphasesfrom_file_to_file) 24 | * [`emphases.from_files_to_files`](#emphasesfrom_files_to_files) 25 | * [Command-line interface](#command-line-interface) 26 | - [Training](#training) 27 | * [Download data](#download-data) 28 | * [Annotate data](#annotate-data) 29 | * [Partition data](#partition-data) 30 | * [Preprocess](#preprocess) 31 | * [Train](#train) 32 | 33 | - [Evaluation](#evaluation) 34 | * [Evaluate](#evaluate) 35 | * [Monitor](#monitor) 36 | - [Citation](#citation) 37 | 38 | 39 | ## Installation 40 | 41 | `pip install emphases` 42 | 43 | By default, we use the Penn Phonetic Forced Aligner (P2FA) via the [`pyfoal`](https://github.com/maxrmorrison/pyfoal/) 44 | repo to perform word alignments. This requires installing HTK. See [the HTK 45 | installation instructions](https://github.com/maxrmorrison/pyfoal/tree/main?tab=readme-ov-file#penn-phonetic-forced-aligner-p2fa) 46 | provided by `pyfoal`. Alternatively, you can use a different forced aligner 47 | and either pass the alignment as a [`pypar.Alignment`](https://github.com/maxrmorrison/pypar/tree/main) 48 | object or save the alignment as a `.TextGrid` file. 49 | 50 | 51 | ## Inference 52 | 53 | Perform automatic emphasis annotation using our best pretrained model 54 | 55 | ```python 56 | import emphases 57 | 58 | # Text and audio of speech 59 | text_file = 'example.txt' 60 | audio_file = 'example.wav' 61 | 62 | # Detect emphases 63 | alignment, prominence = emphases.from_file(text_file, audio_file) 64 | 65 | # Check which words were emphasized 66 | for word, score in zip(alignment, prominence[0]): 67 | print(f'{word} has a prominence of {score}') 68 | ``` 69 | 70 | The `alignment` is a [`pypar.Alignment`](https://github.com/maxrmorrison/pypar) 71 | object. 
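Each word in the returned alignment also carries its start and end times. As a minimal sketch continuing the example above (the printed formatting is illustrative, not part of the library):

```python
# Pair each word with its time span and emphasis score
for word, score in zip(alignment, prominence[0]):
    print(f'{word} ({word.start():.2f}-{word.end():.2f} s): {float(score):.3f}')
```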
72 | 73 | 74 | ### Application programming interface 75 | 76 | #### `emphases.from_alignment_and_audio` 77 | 78 | ```python 79 | def from_alignment_and_audio( 80 | alignment: pypar.Alignment, 81 | audio: torch.Tensor, 82 | sample_rate: int, 83 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 84 | batch_size: Optional[int] = None, 85 | gpu: Optional[int] = None 86 | ) -> Tuple[Type[pypar.Alignment], torch.Tensor]: 87 | """Produce emphasis scores for each word 88 | 89 | Args: 90 | alignment: The forced phoneme alignment 91 | audio: The speech waveform 92 | sample_rate: The audio sampling rate 93 | checkpoint: The model checkpoint to use for inference 94 | batch_size: The maximum number of frames per batch 95 | gpu: The index of the gpu to run inference on 96 | 97 | Returns: 98 | scores: The float-valued emphasis scores for each word 99 | """ 100 | ``` 101 | 102 | 103 | #### `emphases.from_text_and_audio` 104 | 105 | ```python 106 | def from_text_and_audio( 107 | text: str, 108 | audio: torch.Tensor, 109 | sample_rate: int, 110 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 111 | batch_size: Optional[int] = None, 112 | gpu: Optional[int] = None 113 | ) -> Tuple[Type[pypar.Alignment], torch.Tensor]: 114 | """Produce emphasis scores for each word 115 | 116 | Args: 117 | text: The speech transcript 118 | audio: The speech waveform 119 | sample_rate: The audio sampling rate 120 | checkpoint: The model checkpoint to use for inference 121 | batch_size: The maximum number of frames per batch 122 | gpu: The index of the gpu to run inference on 123 | 124 | Returns: 125 | alignment: The forced phoneme alignment 126 | scores: The float-valued emphasis scores for each word 127 | """ 128 | ``` 129 | 130 | 131 | #### `emphases.from_file` 132 | 133 | ```python 134 | def from_file( 135 | text_file: Union[str, bytes, os.PathLike], 136 | audio_file: Union[str, bytes, os.PathLike], 137 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 138 | batch_size: Optional[int] = None, 139 | gpu: Optional[int] = None 140 | ) -> Tuple[Type[pypar.Alignment], torch.Tensor]: 141 | """Produce emphasis scores for each word for files on disk 142 | 143 | Args: 144 | text_file: The speech transcript (.txt) or alignment (.TextGrid) file 145 | audio_file: The speech waveform audio file 146 | checkpoint: The model checkpoint to use for inference 147 | batch_size: The maximum number of frames per batch 148 | gpu: The index of the gpu to run inference on 149 | 150 | Returns: 151 | alignment: The forced phoneme alignment 152 | scores: The float-valued emphasis scores for each word 153 | """ 154 | ``` 155 | 156 | 157 | #### `emphases.from_file_to_file` 158 | 159 | ```python 160 | def from_file_to_file( 161 | text_file: List[Union[str, bytes, os.PathLike]], 162 | audio_file: List[Union[str, bytes, os.PathLike]], 163 | output_prefix: Optional[List[Union[str, bytes, os.PathLike]]] = None, 164 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 165 | batch_size: Optional[int] = None, 166 | gpu: Optional[int] = None 167 | ) -> None: 168 | """Produce emphasis scores for each word for files on disk and save to disk 169 | 170 | Args: 171 | text_file: The speech transcript (.txt) or alignment (.TextGrid) file 172 | audio_file: The speech waveform audio file 173 | output_prefix: The output prefix. Defaults to text file stem. 
174 | checkpoint: The model checkpoint to use for inference 175 | batch_size: The maximum number of frames per batch 176 | gpu: The index of the gpu to run inference on 177 | """ 178 | ``` 179 | 180 | Emphases are saved as a list of five-tuples containing the word, start time, 181 | end time, a float-valued emphasis score, and a boolean that is true if the 182 | word is emphasized. 183 | 184 | 185 | #### `emphases.from_files_to_files` 186 | 187 | ```python 188 | def from_files_to_files( 189 | text_files: List[Union[str, bytes, os.PathLike]], 190 | audio_files: List[Union[str, bytes, os.PathLike]], 191 | output_prefixes: Optional[List[Union[str, bytes, os.PathLike]]] = None, 192 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 193 | batch_size: Optional[int] = None, 194 | gpu: Optional[int] = None 195 | ) -> None: 196 | """Produce emphasis scores for each word for many files and save to disk 197 | 198 | Args: 199 | text_file: The speech transcript (.txt) or alignment (.TextGrid) files 200 | audio_files: The corresponding speech audio files 201 | output_prefixes: The output files. Defaults to text file stems. 202 | checkpoint: The model checkpoint to use for inference 203 | batch_size: The maximum number of frames per batch 204 | gpu: The index of the gpu to run inference on 205 | """ 206 | ``` 207 | 208 | 209 | ### Command-line interface 210 | 211 | ``` 212 | python -m emphases 213 | [-h] 214 | --text_files TEXT_FILES [TEXT_FILES ...] 215 | --audio_files AUDIO_FILES [AUDIO_FILES ...] 216 | [--output_files OUTPUT_FILES [OUTPUT_FILES ...]] 217 | [--checkpoint CHECKPOINT] 218 | [--batch_size BATCH_SIZE] 219 | [--gpu GPU] 220 | 221 | Determine which words in a speech file are emphasized 222 | 223 | options: 224 | -h, --help show this help message and exit 225 | --text_files TEXT_FILES [TEXT_FILES ...] 226 | The speech transcript text files 227 | --audio_files AUDIO_FILES [AUDIO_FILES ...] 228 | The corresponding speech audio files 229 | --output_files OUTPUT_FILES [OUTPUT_FILES ...] 230 | The output files. Default is text files with json suffix. 231 | --checkpoint CHECKPOINT 232 | The model checkpoint to use for inference 233 | --batch_size BATCH_SIZE 234 | The maximum number of frames per batch 235 | --gpu GPU The index of the gpu to run inference on 236 | ``` 237 | 238 | 239 | ## Training 240 | 241 | ### Download data 242 | 243 | `python -m emphases.download --datasets `. 244 | 245 | Downloads and uncompresses datasets. 246 | 247 | **N.B.** We omit Buckeye for public release. This evaluation dataset can be 248 | made by [downloading Buckeye](https://buckeyecorpus.osu.edu/) and matching 249 | the files to the 250 | [annotations](https://github.com/ProSD-Lab/Prominence-perception-in-English-French-Spanish/). 251 | The process of matching the files to the annotations was done for us and is 252 | tricky to replicate exactly. However, due to licensing restrictions on 253 | Buckeye, we cannot legally distribute our private, aligned annotations. 254 | 255 | 256 | ### Annotate data 257 | 258 | Performing annotation requires first installing 259 | [Reproducible Subjective Evaluation (ReSEval)](https://github.com/reseval/reseval). 260 | 261 | `python -m emphases.annotate --datasets ` 262 | 263 | Launches a local web application to perform emphasis annotation, according to 264 | the ReSEval configuration file `emphases/assets/configs/annotate.yaml`. 265 | Requires ReSEval to be installed. 
266 | 267 | `python -m emphases.annotate --datasets --remote --production` 268 | 269 | Launches a crowdsourced emphasis annotation task, according to the ReSEval 270 | configuration file `emphases/assets/configs/annotate.yaml`. Requires ReSEval 271 | to be installed. 272 | 273 | 274 | ### Partition data 275 | 276 | `python -m emphases.partition` 277 | 278 | Generates `train`, `valid`, and `test` partitions for all datasets. 279 | Partitioning is deterministic given the same random seed. You do not need to 280 | run this step, as the original partitions are saved in 281 | `emphases/assets/partitions`. 282 | 283 | 284 | ### Preprocess 285 | 286 | `python -m emphases.preprocess` 287 | 288 | 289 | ### Train 290 | 291 | `python -m emphases.train --config --dataset --gpus ` 292 | 293 | Trains a model according to a given configuration. Uses a list of GPU 294 | indices as an argument, and uses distributed data parallelism (DDP) 295 | if more than one index is given. For example, `--gpus 0 3` will train 296 | using DDP on GPUs `0` and `3`. 297 | 298 | 299 | ## Evaluation 300 | 301 | ### Evaluate 302 | 303 | `python -m emphases.evaluate --config --checkpoint --gpu ` 304 | 305 | 306 | ### Monitor 307 | 308 | Run `tensorboard --logdir runs/`. If you are running training 309 | remotely, you must create a SSH connection with port forwarding to view 310 | Tensorboard. This can be done with `ssh -L 6006:localhost:6006 311 | @`. Then, open `localhost:6006` in your browser. 312 | 313 | 314 | ## Citation 315 | 316 | ### IEEE 317 | M. Morrison, P. Pawar, N. Pruyne, J. Cole, and B. Pardo, "Crowdsourced and Automatic Speech Prominence Estimation," International Conference on Acoustics, Speech, & Signal Processing, 2024. 318 | 319 | 320 | ### BibTex 321 | 322 | ``` 323 | @inproceedings{morrison2024crowdsourced, 324 | title={Crowdsourced and Automatic Speech Prominence Estimation}, 325 | author={Morrison, Max and Pawar, Pranav and Pruyne, Nathan and Cole, Jennifer and Pardo, Bryan}, 326 | booktitle={International Conference on Acoustics, Speech, & Signal Processing}, 327 | year={2024} 328 | } 329 | -------------------------------------------------------------------------------- /config/base.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'base' 5 | -------------------------------------------------------------------------------- /config/baselines/duration-variance.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'duration-variance' 5 | 6 | # Method to use for inference 7 | METHOD = 'duration-variance' 8 | -------------------------------------------------------------------------------- /config/baselines/pitch-variance.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'pitch-variance' 5 | 6 | # Method to use for inference 7 | METHOD = 'pitch-variance' 8 | -------------------------------------------------------------------------------- /config/baselines/prominence.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'prominence' 5 | 6 | # Method to use for inference 7 | METHOD = 'prominence' 8 | -------------------------------------------------------------------------------- /config/downsample/average-inference.py: 
-------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'average-inference' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'inference' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'average' 13 | -------------------------------------------------------------------------------- /config/downsample/average-input.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'average-input' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'input' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'average' 13 | -------------------------------------------------------------------------------- /config/downsample/average-intermediate.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'average-intermediate' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'intermediate' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'average' 13 | -------------------------------------------------------------------------------- /config/downsample/average-loss.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'average-loss' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'loss' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'average' 13 | -------------------------------------------------------------------------------- /config/downsample/center-inference.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'center-inference' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'inference' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'center' 13 | -------------------------------------------------------------------------------- /config/downsample/center-input.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'center-input' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 
8 | DOWNSAMPLE_LOCATION = 'input' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'center' 13 | -------------------------------------------------------------------------------- /config/downsample/center-intermediate.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'center-intermediate' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'intermediate' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'center' 13 | -------------------------------------------------------------------------------- /config/downsample/center-loss.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'center-loss' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'loss' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'center' 13 | -------------------------------------------------------------------------------- /config/downsample/max-inference.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'max-inference' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'inference' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'max' 13 | -------------------------------------------------------------------------------- /config/downsample/max-input.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'max-input' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'input' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'max' 13 | -------------------------------------------------------------------------------- /config/downsample/max-intermediate.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'max-intermediate' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'intermediate' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 
12 | DOWNSAMPLE_METHOD = 'max' 13 | -------------------------------------------------------------------------------- /config/downsample/max-loss.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'max-loss' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'loss' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'max' 13 | -------------------------------------------------------------------------------- /config/downsample/sum-inference.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'sum-inference' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'inference' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'sum' 13 | -------------------------------------------------------------------------------- /config/downsample/sum-input.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'sum-input' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'input' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'sum' 13 | -------------------------------------------------------------------------------- /config/downsample/sum-intermediate.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'sum-intermediate' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'intermediate' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'sum' 13 | -------------------------------------------------------------------------------- /config/downsample/sum-loss.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'sum-loss' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'loss' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 
12 | DOWNSAMPLE_METHOD = 'sum' 13 | -------------------------------------------------------------------------------- /config/hparam-search/batch-060000.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'batch-060000' 5 | 6 | # Maximum number of frames in one batch 7 | MAX_TRAINING_FRAMES = 60000 8 | -------------------------------------------------------------------------------- /config/hparam-search/batch-070000.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'batch-070000' 5 | 6 | # Maximum number of frames in one batch 7 | MAX_TRAINING_FRAMES = 70000 8 | -------------------------------------------------------------------------------- /config/hparam-search/batch-075000.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'batch-075000' 5 | 6 | # Maximum number of frames in one batch 7 | MAX_TRAINING_FRAMES = 75000 8 | -------------------------------------------------------------------------------- /config/hparam-search/batch-100000.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'batch-100000' 5 | 6 | # Maximum number of frames in one batch 7 | MAX_TRAINING_FRAMES = 100000 8 | -------------------------------------------------------------------------------- /config/hparam-search/buckets-1.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'buckets-1' 5 | 6 | # Number of buckets of data lengths used by the sampler 7 | BUCKETS = 1 8 | -------------------------------------------------------------------------------- /config/hparam-search/buckets-2.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'buckets-2' 5 | 6 | # Number of buckets of data lengths used by the sampler 7 | BUCKETS = 2 8 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-5-80.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-5-80' 5 | 6 | # Model width 7 | CHANNELS = 80 8 | 9 | # Number of network layers 10 | LAYERS = 5 11 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-6-128.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-6-128' 5 | 6 | # Model width 7 | CHANNELS = 128 8 | 9 | # Number of network layers 10 | LAYERS = 6 11 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-6-64.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-6-64' 5 | 6 | # Model width 7 | CHANNELS = 64 8 | 9 | # Number of network layers 10 | LAYERS = 6 11 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-6-80.py: 
-------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-6-80' 5 | 6 | # Model width 7 | CHANNELS = 80 8 | 9 | # Number of network layers 10 | LAYERS = 6 11 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-7-80.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-7-80' 5 | 6 | # Model width 7 | CHANNELS = 80 8 | 9 | # Number of network layers 10 | LAYERS = 7 11 | -------------------------------------------------------------------------------- /config/hparam-search/decoder-kernel-1.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'decoder-kernel-1' 5 | 6 | # Decoder convolution kernel size 7 | DECODER_KERNEL_SIZE = 1 8 | -------------------------------------------------------------------------------- /config/hparam-search/decoder-kernel-5.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'decoder-kernel-5' 5 | 6 | # Decoder convolution kernel size 7 | DECODER_KERNEL_SIZE = 5 8 | -------------------------------------------------------------------------------- /config/hparam-search/dropout-05.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'dropout-05' 5 | 6 | # Dropout probability (or None to not use dropout) 7 | DROPOUT = .05 8 | -------------------------------------------------------------------------------- /config/hparam-search/dropout-10.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'dropout-10' 5 | 6 | # Dropout probability (or None to not use dropout) 7 | DROPOUT = .1 8 | -------------------------------------------------------------------------------- /config/hparam-search/encoder-kernel-5.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'encoder-kernel-5' 5 | 6 | # Encoder convolution kernel size 7 | ENCODER_KERNEL_SIZE = 5 8 | -------------------------------------------------------------------------------- /config/hparam-search/encoder-kernel-7.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'encoder-kernel-7' 5 | 6 | # Encoder convolution kernel size 7 | ENCODER_KERNEL_SIZE = 7 8 | -------------------------------------------------------------------------------- /config/hparam-search/gelu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | MODULE = 'emphases' 4 | 5 | # Configuration name 6 | CONFIG = 'gelu' 7 | 8 | # Activation function to use in convolution model 9 | ACTIVATION_FUNCTION = torch.nn.GELU 10 | -------------------------------------------------------------------------------- /config/hparam-search/leaky-relu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | MODULE = 'emphases' 4 | 5 | # Configuration name 6 | CONFIG = 'leaky-relu' 7 | 8 | # Activation function to use in 
convolution model 9 | ACTIVATION_FUNCTION = torch.nn.LeakyReLU 10 | -------------------------------------------------------------------------------- /config/hparam-search/mse.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'mse' 5 | 6 | # Loss function. One of ['bce', 'mse'] 7 | LOSS = 'mse' 8 | -------------------------------------------------------------------------------- /config/hparam-search/silu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | MODULE = 'emphases' 4 | 5 | # Configuration name 6 | CONFIG = 'silu' 7 | 8 | # Activation function to use in convolution model 9 | ACTIVATION_FUNCTION = torch.nn.SiLU 10 | -------------------------------------------------------------------------------- /config/scaling/16-2.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '16-2' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of allowed annotations 10 | MAX_ANNOTATIONS = 2 11 | 12 | # Maximum number of training utterances 13 | MAX_TRAINING_UTTERANCES = 1600 14 | 15 | # Minimum number of allowed annotations 16 | MIN_ANNOTATIONS = 2 17 | 18 | # Whether to use the specified one-eighth dataset for scaling law experiments 19 | ONE_EIGHTH_UTTERANCES = True 20 | 21 | # Dataset to use for validation 22 | VALIDATION_DATASET = 'buckeye' 23 | -------------------------------------------------------------------------------- /config/scaling/1600.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '1600' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of training utterances 10 | MAX_TRAINING_UTTERANCES = 1600 11 | 12 | # Whether to use the specified one-eighth dataset for scaling law experiments 13 | ONE_EIGHTH_UTTERANCES = True 14 | 15 | # Dataset to use for validation 16 | VALIDATION_DATASET = 'buckeye' 17 | -------------------------------------------------------------------------------- /config/scaling/32-4.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '32-4' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of allowed annotations 10 | MAX_ANNOTATIONS = 4 11 | 12 | # Maximum number of training utterances 13 | MAX_TRAINING_UTTERANCES = 800 14 | 15 | # Minimum number of allowed annotations 16 | MIN_ANNOTATIONS = 4 17 | 18 | # Whether to use the specified one-eighth dataset for scaling law experiments 19 | ONE_EIGHTH_UTTERANCES = True 20 | 21 | # Dataset to use for validation 22 | VALIDATION_DATASET = 'buckeye' 23 | -------------------------------------------------------------------------------- /config/scaling/3200.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '3200' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of training utterances 10 | MAX_TRAINING_UTTERANCES = 3200 11 | 12 | # Whether to use the specified one-eighth dataset for scaling law experiments 13 | ONE_EIGHTH_UTTERANCES = True 14 | 15 | # Dataset to use for 
validation 16 | VALIDATION_DATASET = 'buckeye' 17 | -------------------------------------------------------------------------------- /config/scaling/400.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '400' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of training utterances 10 | MAX_TRAINING_UTTERANCES = 400 11 | 12 | # Whether to use the specified one-eighth dataset for scaling law experiments 13 | ONE_EIGHTH_UTTERANCES = True 14 | 15 | # Dataset to use for validation 16 | VALIDATION_DATASET = 'buckeye' 17 | -------------------------------------------------------------------------------- /config/scaling/64-8.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '64-8' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of allowed annotations 10 | MAX_ANNOTATIONS = 8 11 | 12 | # Maximum number of training utterances 13 | MAX_TRAINING_UTTERANCES = 400 14 | 15 | # Minimum number of allowed annotations 16 | MIN_ANNOTATIONS = 8 17 | 18 | # Whether to use the specified one-eighth dataset for scaling law experiments 19 | ONE_EIGHTH_UTTERANCES = True 20 | 21 | # Dataset to use for validation 22 | VALIDATION_DATASET = 'buckeye' 23 | -------------------------------------------------------------------------------- /config/scaling/8-1.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '8-1' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of allowed annotations 10 | MAX_ANNOTATIONS = 1 11 | 12 | # Maximum number of training utterances 13 | MAX_TRAINING_UTTERANCES = 3200 14 | 15 | # Minimum number of allowed annotations 16 | MIN_ANNOTATIONS = 1 17 | 18 | # Whether to use the specified one-eighth dataset for scaling law experiments 19 | ONE_EIGHTH_UTTERANCES = True 20 | 21 | # Dataset to use for validation 22 | VALIDATION_DATASET = 'buckeye' 23 | -------------------------------------------------------------------------------- /config/scaling/800.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '800' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of training utterances 10 | MAX_TRAINING_UTTERANCES = 800 11 | 12 | # Whether to use the specified one-eighth dataset for scaling law experiments 13 | ONE_EIGHTH_UTTERANCES = True 14 | 15 | # Dataset to use for validation 16 | VALIDATION_DATASET = 'buckeye' 17 | -------------------------------------------------------------------------------- /config/scaling/base-automatic.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'base-automatic' 5 | -------------------------------------------------------------------------------- /data/cache/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/data/cache/.gitkeep -------------------------------------------------------------------------------- /data/datasets/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/data/datasets/.gitkeep -------------------------------------------------------------------------------- /data/sources/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/data/sources/.gitkeep -------------------------------------------------------------------------------- /emphases/__init__.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Configuration 3 | ############################################################################### 4 | 5 | 6 | # Default configuration parameters to be modified 7 | from .config import defaults 8 | 9 | # Modify configuration 10 | import yapecs 11 | yapecs.configure('emphases', defaults) 12 | 13 | # Import configuration parameters 14 | from .config.defaults import * 15 | from .config.static import * 16 | 17 | 18 | ############################################################################### 19 | # Module imports 20 | ############################################################################### 21 | 22 | 23 | from .core import * 24 | from .model import Model 25 | from . train import loss, train 26 | from . import annotate 27 | from . import baselines 28 | from . import convert 29 | from . import data 30 | from . import evaluate 31 | from . import load 32 | from . import partition 33 | from . import plot 34 | -------------------------------------------------------------------------------- /emphases/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Determine which words in a speech file are emphasized 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser( 15 | description='Determine which words in a speech file are emphasized') 16 | parser.add_argument( 17 | '--text_files', 18 | type=Path, 19 | nargs='+', 20 | required=True, 21 | help='The speech transcript (.txt) or alignment (.TextGrid) files') 22 | parser.add_argument( 23 | '--audio_files', 24 | type=Path, 25 | nargs='+', 26 | required=True, 27 | help='The corresponding speech audio files') 28 | parser.add_argument( 29 | '--output_prefixes', 30 | type=Path, 31 | nargs='+', 32 | required=False, 33 | help='output_prefixes: The output files. 
Defaults to text files stems.') 34 | parser.add_argument( 35 | '--checkpoint', 36 | type=Path, 37 | help='The model checkpoint to use for inference') 38 | parser.add_argument( 39 | '--batch_size', 40 | type=int, 41 | help='The maximum number of frames per batch') 42 | parser.add_argument( 43 | '--gpu', 44 | type=int, 45 | help='The index of the gpu to run inference on') 46 | return parser.parse_args() 47 | 48 | 49 | if __name__ == '__main__': 50 | emphases.from_files_to_files(**vars(parse_args())) 51 | -------------------------------------------------------------------------------- /emphases/annotate/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/annotate/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Annotate emphases 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser(description='Perform emphasis annotation') 15 | parser.add_argument( 16 | '--annotation_config', 17 | type=Path, 18 | default=emphases.DEFAULT_ANNOTATION_CONFIG, 19 | help='The ReSEval configuration file for the annotation task') 20 | parser.add_argument( 21 | '--dataset', 22 | default='libritts', 23 | help='The dataset to annotate') 24 | parser.add_argument( 25 | '--directory', 26 | type=Path, 27 | default=emphases.ANNOTATION_DIR, 28 | help='The directory to save results to') 29 | parser.add_argument( 30 | '--remote', 31 | action='store_true', 32 | help='Run subjective evaluation remotely') 33 | parser.add_argument( 34 | '--production', 35 | action='store_true', 36 | help='Deploy the subjective evaluation to crowdsource participants') 37 | parser.add_argument( 38 | '--interval', 39 | type=int, 40 | default=120, 41 | help='The time between monitoring updates in seconds') 42 | return parser.parse_args() 43 | 44 | 45 | if __name__ == '__main__': 46 | emphases.annotate.datasets(**vars(parse_args())) 47 | -------------------------------------------------------------------------------- /emphases/annotate/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import shutil 4 | 5 | import pypar 6 | import reseval 7 | 8 | import emphases 9 | 10 | 11 | ############################################################################### 12 | # Annotate emphases 13 | ############################################################################### 14 | 15 | 16 | def datasets( 17 | annotation_config=emphases.DEFAULT_ANNOTATION_CONFIG, 18 | dataset='libritts', 19 | directory=emphases.ANNOTATION_DIR, 20 | remote=False, 21 | production=False, 22 | interval=120): 23 | """Perform emphasis annotation on datasets""" 24 | # Create input and output directories 25 | directory.mkdir(exist_ok=True, parents=True) 26 | index = f'{len(list(directory.glob("*"))):02}' 27 | input_directory = directory / index 28 | input_directory.mkdir(exist_ok=True, parents=True) 29 | output_directory = emphases.DATA_DIR / 'crowdsource' / index 30 | output_directory.mkdir(exist_ok=True, parents=True) 31 | 32 | # Get audio files 33 | cache_directory = emphases.CACHE_DIR / dataset 34 | 
audio_files = sorted(list(cache_directory.rglob('*.wav'))) 35 | 36 | # Deterministic shuffle 37 | random.seed(emphases.RANDOM_SEED) 38 | random.shuffle(audio_files) 39 | 40 | # Iterate over audio files 41 | for audio_file in audio_files: 42 | 43 | # Save audio 44 | shutil.copyfile(audio_file, input_directory / audio_file.name) 45 | 46 | # Load alignment 47 | alignment = pypar.Alignment( 48 | cache_directory / 49 | 'alignment' / 50 | f'{audio_file.stem}.TextGrid') 51 | 52 | # Save text 53 | text_file = input_directory / f'{audio_file.stem}-words.txt' 54 | with open(text_file, 'w') as file: 55 | file.write( 56 | ' '.join([ 57 | str(word) for word in alignment 58 | if str(word) != pypar.SILENCE])) 59 | 60 | # Run annotation 61 | reseval.run( 62 | annotation_config, 63 | input_directory, 64 | output_directory, 65 | not remote, 66 | production, 67 | interval) 68 | -------------------------------------------------------------------------------- /emphases/assets/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/assets/checkpoints/.gitkeep -------------------------------------------------------------------------------- /emphases/assets/checkpoints/checkpoint.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/assets/checkpoints/checkpoint.pt -------------------------------------------------------------------------------- /emphases/assets/configs/annotate.yaml: -------------------------------------------------------------------------------- 1 | # A name to give to this evaluation configuration 2 | name: emphasis-annotation 3 | 4 | # The type of test to run. One of [ab, abx, mos, mushra, wordselect]. 5 | test: wordselect 6 | 7 | # The type of data to use. One of [audio, image, text, video]. 8 | datatype: audio 9 | 10 | # The location to store files used for evaluation. One of [aws]. 11 | storage: aws 12 | 13 | # The third-party platform hosting the MySQL database. One of [aws, heroku]. 14 | database: aws 15 | 16 | # The third-party platform hosting the server. One of [aws, heroku]. 17 | server: aws 18 | 19 | # Crowdsourcing configuration 20 | crowdsource: 21 | 22 | # The crowdsourcing platform used for evaluation. One of [mturk]. 23 | platform: mturk 24 | 25 | # The survey title shown to potential participants 26 | title: Emphasis annotation 27 | 28 | # The survey description shown to potential participants 29 | description: "Participate in a research study by listening to English speech and selecting emphasized words. Requires headphones and a quiet listening environment to pass listening test prescreening. $2.89 bonus on completion. Estimated 15 minutes ($13.35 / hour)." 
30 | 31 | # Keywords that participants can use to find your survey 32 | keywords: annotate, audio, emphasis, headphones, speech 33 | 34 | # Filter participants 35 | filter: 36 | 37 | # Only allow participants from a certain countries 38 | countries: ['US'] 39 | 40 | # Only allow participants who have previously completed at least this 41 | # number of tasks 42 | approved_tasks: 1000 43 | 44 | # Only allow participants who have a sufficiently high acceptance rating 45 | approval_rating: 99 46 | 47 | # How much you pay participants (in US dollars) 48 | # E.g., 2.00 is two dollars; 0.50 is fifty cents 49 | payment: 50 | 51 | # The amount that you pay even if they don't pass prescreening 52 | base: 0.45 53 | 54 | # The additional amount that you pay participants who complete evaluation 55 | completion: 2.89 56 | 57 | # How long to wait for things (in seconds) 58 | duration: 59 | 60 | # Total lifespan of the evaluation, after which the evaluation is no 61 | # longer available for participants to take 62 | total: 604800 63 | 64 | # The maximum time you will allow a participant to spend on your task 65 | assignment: 5400 66 | 67 | # Duration after which payment is automatically made 68 | autoapprove: 172800 69 | 70 | # The number of participants 71 | participants: 10 72 | 73 | # The number of evaluations each participant performs 74 | samples_per_participant: 20 75 | 76 | # A seed to use for deterministic random sampling 77 | random_seed: 0 78 | 79 | # Introduction text to display on the first page participants visit 80 | # N.B. This is not the actual IRB-approved survey text used in our studies, 81 | # as we do not want others claiming to be part of our IRB-approved study. 82 | welcome_text: " 83 | # **Welcome!**\n 84 | We are conducting a research study to evaluate the 85 | quality of an audio processing algorithm. If you agree to participate, you 86 | will be asked to fill out a brief questionnaire. You will then be asked to 87 | evaluate a series of audio samples.\n 88 | ### **Privacy**\nThis survey is completely anonymous. We will NOT collect 89 | any personally identifiable information. Your participation in this study 90 | does not involve any risk to you beyond that of your everyday life.\n 91 | ### **Consent**\nBy pressing **I Agree**, you confirm you are willing 92 | to participate in this research. However, you are free to withdraw your 93 | participation at any time.\n 94 | ### **Contact Information**\nIf you have any questions or feedback, 95 | please contact ." 96 | 97 | # Questions that participants must answer before they are permitted to 98 | # perform evaluation. If a multiple choice question has correct_answer 99 | # defined, the participant must select that answer to be able to continue 100 | # to the evaluation. 101 | prescreen_questions: [] 102 | 103 | # Include an audio listening test 104 | listening_test: 105 | 106 | # Listening test instructions 107 | instructions: " 108 | ## **Instructions** \nMake sure your headphones are on and your volume 109 | is turned up to a comfortable level. Listen to the audio. Then, select 110 | how many tones you heard." 111 | 112 | # Number of questions to include on the listening test 113 | num_questions: 2 114 | 115 | # Number of allowed retries before the participant fails the test 116 | retries: 2 117 | 118 | # Instructions presented to the participant during evaluation 119 | survey_instructions: " 120 | 121 | ## **Instructions** \nListen to the audio file a minimum of two times. 
122 | Select the words that were emphasized by the speaker. The emphasized 123 | words are those that stand out from nearby words. Play the audio and then 124 | click on a word to select (boldface) or deselect it." 125 | 126 | # Questions presented to the participant after evaluation 127 | followup_questions: 128 | 129 | # Ask participant for their native language 130 | - name: Language 131 | 132 | # The type of question. One of [free-response, multiple-choice]. 133 | type: multiple-choice 134 | 135 | # Question text 136 | text: What is your native language? 137 | 138 | # Possible answers 139 | answers: [ 140 | 'Albanian', 141 | 'Amharic', 142 | 'Arabic', 143 | 'Bengali', 144 | 'Berber', 145 | 'Creole', 146 | 'Dari', 147 | 'Dzongkha', 148 | 'English', 149 | 'Farsi', 150 | 'Filipino', 151 | 'French', 152 | 'German', 153 | 'Gujarati', 154 | 'Hakka', 155 | 'Hausa', 156 | 'Hebrew', 157 | 'Hindi', 158 | 'Hokkien', 159 | 'Indonesian', 160 | 'Italian', 161 | 'Japanese', 162 | 'Javanese', 163 | 'Kannada', 164 | 'Korean', 165 | 'Mandarin Chinese', 166 | 'Marathi', 167 | 'Nepali', 168 | 'Nigerian Pidgin', 169 | 'Oromo', 170 | 'Pashto', 171 | 'Patois', 172 | 'Polish', 173 | 'Portuguese', 174 | 'Russian', 175 | 'Spanish', 176 | 'Swahili', 177 | 'Somali', 178 | 'Tagalog', 179 | 'Tamil', 180 | 'Telugu', 181 | 'Thai', 182 | 'Turkish', 183 | 'Ukrainian', 184 | 'Urdu', 185 | 'Uzbek', 186 | 'Vietnamese', 187 | 'Western Punjabi', 188 | 'Wu Chinese', 189 | 'Yue Chinese', 190 | 'Other'] 191 | 192 | # Ask participant for their country of origin 193 | - name: Country 194 | 195 | # The type of question. One of [free-response, multiple-choice]. 196 | type: multiple-choice 197 | 198 | # Question text 199 | text: What country/region did you live in during your childhood? 200 | 201 | # Possible answers 202 | answers: [ 203 | 'Afghanistan', 204 | 'Albania', 205 | 'Argentina', 206 | 'Bangladesh', 207 | 'Bhutan', 208 | 'Brazil', 209 | 'Cameroon', 210 | 'Canada', 211 | 'China', 212 | 'Colombia', 213 | 'Cuba', 214 | 'Dominican Republic', 215 | 'Ecuador', 216 | 'Egypt', 217 | 'El Salvador', 218 | 'Ethiopia', 219 | 'France', 220 | 'Germany', 221 | 'Ghana', 222 | 'Guatemala', 223 | 'Guyana', 224 | 'Haiti', 225 | 'Honduras', 226 | 'India', 227 | 'Iran', 228 | 'Iraq', 229 | 'Israel', 230 | 'Jamaica', 231 | 'Japan', 232 | 'Jordan', 233 | 'Kenya', 234 | 'Mexico', 235 | 'Morocco', 236 | 'Nepal', 237 | 'Nicaragua', 238 | 'Nigeria', 239 | 'Pakistan', 240 | 'Peru', 241 | 'Philippines', 242 | 'Poland', 243 | 'Russia', 244 | 'Somalia', 245 | 'South Korea', 246 | 'Syria', 247 | 'Taiwan', 248 | 'Thailand', 249 | 'Turkey', 250 | 'Ukraine', 251 | 'United Kingdom', 252 | 'United States', 253 | 'Uzbekistan', 254 | 'Venezuela', 255 | 'Vietnam', 256 | 'Yemen', 257 | 'Other'] 258 | -------------------------------------------------------------------------------- /emphases/assets/partitions/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/assets/partitions/.gitkeep -------------------------------------------------------------------------------- /emphases/assets/partitions/buckeye.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": [], 3 | "valid": [], 4 | "test": [ 5 | "s25-1", 6 | "s04-1", 7 | "s16-1", 8 | "s26-1", 9 | "s02-1", 10 | "s03-1", 11 | "s22-1", 12 | "s32-1", 13 | "s21-1", 14 | "s24-1", 15 | "s17-1", 16 | "s14-1", 17 | "s11-1" 18 
| ] 19 | } -------------------------------------------------------------------------------- /emphases/baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from . import prominence 2 | from . import duration_variance 3 | from . import pitch_variance 4 | -------------------------------------------------------------------------------- /emphases/baselines/duration_variance/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/baselines/duration_variance/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | ############################################################################### 5 | # Duration variance baseline method 6 | ############################################################################### 7 | 8 | 9 | def infer(alignment): 10 | """Compute per-word emphasis scores using duration variance method""" 11 | # Average duration of phonemes in the sentence 12 | average_duration = alignment.duration() / len(alignment.phonemes()) 13 | 14 | # Average duration of phonemes in each word 15 | average_duration_per_word = torch.tensor([ 16 | word.duration() / len(word) for word in alignment]) 17 | 18 | # Zero-center 19 | return (average_duration_per_word - average_duration)[None] 20 | -------------------------------------------------------------------------------- /emphases/baselines/pitch_variance/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/baselines/pitch_variance/core.py: -------------------------------------------------------------------------------- 1 | import penn 2 | import torch 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Pitch variance method 9 | ############################################################################### 10 | 11 | 12 | def infer(alignment, audio, sample_rate, gpu=None): 13 | """Compute per-word emphasis scores using pitch variance method""" 14 | # Infer pitch and periodicity 15 | pitch, _ = penn.from_audio( 16 | audio, 17 | sample_rate, 18 | hopsize=emphases.HOPSIZE_SECONDS, 19 | fmin=emphases.FMIN, 20 | fmax=emphases.FMAX, 21 | pad=True, 22 | interp_unvoiced_at=emphases.VOICED_THRESHOLD, 23 | gpu=gpu) 24 | 25 | # Compute pitch statistics in base-two log-space 26 | pitch = torch.log2(pitch) 27 | 28 | # Compute utterance statistics 29 | utterance_spread = spread(pitch) 30 | 31 | # Compute word statistics 32 | word_spreads = [] 33 | for word in alignment: 34 | start = int(emphases.convert.seconds_to_frames(word.start())) 35 | end = int(emphases.convert.seconds_to_frames(word.end())) 36 | word_spreads.append(spread(pitch[0, start:end])) 37 | word_spreads = torch.tensor( 38 | word_spreads, 39 | dtype=pitch.dtype, 40 | device=pitch.device)[None] 41 | 42 | # Zero-center 43 | return word_spreads - utterance_spread 44 | 45 | 46 | ############################################################################### 47 | # Utilities 48 | ############################################################################### 49 | 50 | 51 | def spread(pitch): 52 | """Compute pitch spread""" 53 | return torch.quantile(pitch, .95) - torch.quantile(pitch, .05) 54 | 
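
The two variance baselines above share a deliberately small interface: duration_variance.infer needs only a pypar word/phoneme alignment, while pitch_variance.infer additionally takes the audio and its sample rate so that pitch can be estimated with penn. A minimal usage sketch follows; the file paths are hypothetical, and the loaders (pypar.Alignment, emphases.load.audio) are the same ones used elsewhere in this repository.

    import emphases
    import pypar

    # Hypothetical inputs: a forced alignment and the corresponding mono audio
    alignment = pypar.Alignment('example.TextGrid')
    audio = emphases.load.audio('example.wav')

    # Per-word duration-variance scores; shape (1, words)
    duration_scores = emphases.baselines.duration_variance.infer(alignment)

    # Per-word pitch-variance scores; gpu=None runs pitch estimation on the CPU
    pitch_scores = emphases.baselines.pitch_variance.infer(
        alignment,
        audio,
        emphases.SAMPLE_RATE,
        gpu=None)
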
-------------------------------------------------------------------------------- /emphases/baselines/prominence/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from . import cwt_utils 3 | from . import duration_processing 4 | from . import energy_processing 5 | from . import f0_processing 6 | from . import filter 7 | from . import loma 8 | from . import pitch_tracker 9 | from . import smooth_and_interp 10 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/core.py: -------------------------------------------------------------------------------- 1 | import fractions 2 | 3 | import torch 4 | import numpy as np 5 | from scipy.signal import resample_poly 6 | 7 | import emphases 8 | 9 | 10 | ############################################################################### 11 | # Prominence API 12 | ############################################################################### 13 | 14 | 15 | def infer(alignment, audio, sample_rate): 16 | """Compute per-word prominence from alignment and audio""" 17 | # Convert to numpy 18 | audio = audio.numpy()[0] 19 | 20 | # Compute energy 21 | energy = emphases.baselines.prominence.energy_processing.extract_energy( 22 | audio, 23 | sample_rate) 24 | energy = np.cbrt(energy + 1) 25 | 26 | # Smooth energy 27 | energy = emphases.baselines.prominence.smooth_and_interp.peak_smooth( 28 | energy, 29 | 30, 30 | 3) 31 | energy = emphases.baselines.prominence.smooth_and_interp.smooth(energy, 10) 32 | 33 | # Compute pitch 34 | pitch = emphases.baselines.prominence.pitch_tracker.inst_freq_pitch( 35 | audio, 36 | sample_rate) 37 | pitch = emphases.baselines.prominence.f0_processing.process(pitch) 38 | 39 | # Extract duration 40 | duration = \ 41 | emphases.baselines.prominence.duration_processing.get_duration_signal( 42 | alignment, 43 | weights=[.5, .5], 44 | rate=200) 45 | 46 | # Slice features 47 | min_length = np.min([len(pitch), len(energy), len(duration)]) 48 | pitch = pitch[:min_length] 49 | energy = energy[:min_length] 50 | duration = duration[:min_length] 51 | 52 | # Combine features 53 | combined = ( 54 | emphases.PROMINENCE_PITCH_WEIGHT * normalize(pitch) + 55 | emphases.PROMINENCE_ENERGY_WEIGHT * normalize(energy) + 56 | emphases.PROMINENCE_DURATION_WEIGHT * normalize(duration)) 57 | combined = normalize( 58 | emphases.baselines.prominence.smooth_and_interp.remove_bias( 59 | combined, 60 | 800)) 61 | 62 | # Distance between adjacent scales (.25 means 4 scales per octave) 63 | scale_distance = .25 # octaves 64 | 65 | # Continuous wavelet transform analysis 66 | cwt, scales, freqs = emphases.baselines.prominence.cwt_utils.cwt_analysis( 67 | combined, 68 | mother_name='mexican_hat', 69 | period=3, 70 | num_scales=34, 71 | scale_distance=scale_distance, 72 | apply_coi=False) 73 | cwt = np.real(cwt) 74 | scales *= 200 75 | 76 | # Get scale that minimizes distance with average word length 77 | average_duration = (alignment.end() / len(alignment))*200 78 | scales = 1. 
/ freqs * 200 * .5 79 | scale = np.argmin(np.abs(scales - average_duration)) 80 | 81 | # Define the scale information 82 | pos_loma_start = scale + \ 83 | int(emphases.LOMA_PROMINENCE_START / scale_distance) 84 | pos_loma_end = scale + \ 85 | int(emphases.LOMA_PROMINENCE_END / scale_distance) 86 | neg_loma_start = scale + \ 87 | int(emphases.LOMA_BOUNDARY_START / scale_distance) 88 | neg_loma_end = scale + \ 89 | int(emphases.LOMA_BOUNDARY_END / scale_distance) 90 | 91 | # Retrieve line of maximum amplitude 92 | pos_loma = emphases.baselines.prominence.loma.get_loma( 93 | cwt, 94 | scales, 95 | pos_loma_start, 96 | pos_loma_end) 97 | neg_loma = emphases.baselines.prominence.loma.get_loma( 98 | -cwt, 99 | scales, 100 | neg_loma_start, 101 | neg_loma_end) 102 | 103 | # Decode prominence 104 | max_loma = np.array(emphases.baselines.prominence.loma.get_prominences( 105 | pos_loma, 106 | alignment, 107 | rate=200)) 108 | 109 | # Prominence dimensions - [time, value] 110 | prominences = torch.tensor(max_loma) 111 | 112 | # Decode boundaries 113 | # Boundries dimensions - [time, value] 114 | boundaries = torch.tensor(emphases.baselines.prominence.loma.get_boundaries( 115 | max_loma, 116 | neg_loma, 117 | alignment)) 118 | 119 | return prominences[:, 1][None] 120 | 121 | 122 | ############################################################################### 123 | # Utilities 124 | ############################################################################### 125 | 126 | 127 | def normalize(features): 128 | """Normalize features""" 129 | return (features - np.nanmean(features)) / (np.nanstd(features) + 1e-7) 130 | 131 | 132 | def resample(signal, original_sample_rate, target_sample_rate): 133 | """Resample signal""" 134 | ratio = fractions.Fraction(target_sample_rate, original_sample_rate) 135 | return resample_poly(signal, ratio.numerator, ratio.denominator) 136 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/cwt_utils.py: -------------------------------------------------------------------------------- 1 | from numpy import array, sqrt, pad, mean, pi 2 | 3 | import pycwt as cwt 4 | 5 | 6 | ########################################################################################### 7 | # Private routines 8 | ########################################################################################### 9 | 10 | 11 | def _padded_cwt(params, dt, dj, s0, J, mother, padding_len): 12 | """Private function to compute a wavelet transform on padded data 13 | 14 | Parameters 15 | ---------- 16 | params: arraylike 17 | The prosodic parameters. 18 | dt: ? 19 | ? 20 | dj: ? 21 | ? 22 | s0: ? 23 | ? 24 | J: ? 25 | ? 26 | mother: ? 27 | The mother wavelet. 28 | padding_len: int 29 | The padding length 30 | 31 | Returns 32 | ------- 33 | wavelet_matrix: ndarray 34 | The wavelet data resulting from the analysis 35 | scales: arraylike 36 | The scale indices corresponding to the wavelet data 37 | freqs: ? 38 | ? 39 | coi: array 40 | The cone of influence values 41 | fft: ? 42 | ? 43 | fftfreqs: ? 44 | ? 
45 | """ 46 | padded = pad(params, padding_len, mode='edge') 47 | wavelet_matrix, scales, freqs, coi, fft, fftfreqs = cwt.cwt( 48 | padded, 49 | dt, 50 | dj, 51 | s0, 52 | J, 53 | mother) 54 | wavelet_matrix = \ 55 | wavelet_matrix[:, padding_len:len(wavelet_matrix[0]) - padding_len] 56 | return wavelet_matrix, scales, freqs, coi, fft, fftfreqs 57 | 58 | 59 | def _zero_outside_coi(wavelet_matrix, freqs, rate=200): 60 | """Private function to set each elements outside of the Cone Of Influence (coi) to 0. 61 | 62 | Parameters 63 | ---------- 64 | wavelet_matrix: type 65 | description 66 | freqs: type 67 | description 68 | """ 69 | for i in range(0, wavelet_matrix.shape[0]): 70 | coi = int(1. / freqs[i] * rate) 71 | wavelet_matrix[i, :coi] = 0. 72 | wavelet_matrix[i, -coi:] = 0. 73 | return wavelet_matrix 74 | 75 | 76 | def _scale_for_reconstruction( 77 | wavelet_matrix, 78 | scales, 79 | dj, 80 | dt, 81 | mother='mexican_hat', 82 | period=3): 83 | """ ? 84 | 85 | Parameters 86 | ---------- 87 | wavelet_matrix: ndarray 88 | The wavelet data resulting from the analysis 89 | scales: arraylike 90 | The scale indices corresponding to the wavelet data 91 | dj: ? 92 | ? 93 | dt: ? 94 | ? 95 | mother: ? 96 | ? 97 | period: ? 98 | ? 99 | """ 100 | scaled = array(wavelet_matrix) 101 | 102 | # mexican Hat 103 | c = dj / (3.541 * .867) 104 | 105 | if mother == 'morlet': 106 | cc = 1.83 107 | #periods 5 and 6 are correct, 3,4 approximate 108 | if period == 3: 109 | cc = 1.74 110 | if period == 4: 111 | cc = 1.1 112 | elif period == 5: 113 | cc = .9484 114 | elif period == 6: 115 | cc = .7784 116 | c = dj / (cc * pi ** (-.25)) 117 | 118 | for i in range(0, len(scales)): 119 | scaled[i] *= c * sqrt(dt) / sqrt(scales[i]) 120 | # substracting the mean should not be necessary? 121 | scaled[i] -= mean(scaled[i]) 122 | 123 | return scaled 124 | 125 | 126 | def cwt_analysis( 127 | params, 128 | mother_name='mexican_hat', 129 | num_scales=12, 130 | first_scale=None, 131 | scale_distance=1., 132 | apply_coi=True, 133 | period=5, 134 | frame_rate=200): 135 | """Achieve the continous wavelet analysis of given parameters 136 | 137 | Parameters 138 | ---------- 139 | params: arraylike 140 | The parameters to analyze. 141 | mother_name: string, optional 142 | The name of the mother wavelet [default: mexican_hat]. 143 | num_scales: int, optional 144 | The number of scales [default: 12]. 145 | first_scale: int, optional 146 | The width of the shortest scale 147 | scale_distance: float, optional 148 | The distance between scales [default: 1.0]. 149 | apply_coi: boolean, optional 150 | Apply the Cone Of Influence (coi) 151 | period: int, optional 152 | The period of the mother wavelet [default: 5]. 153 | frame_rate: int, optional 154 | The signal frame rate [default: 200]. 155 | 156 | Returns 157 | ------- 158 | wavelet_matrix: ndarray 159 | The wavelet data resulting from the analysis 160 | scales: arraylike 161 | The scale indices corresponding to the wavelet data 162 | """ 163 | # setup wavelet transform 164 | dt = 1. 
/ float(frame_rate) # frame length 165 | 166 | if not first_scale: 167 | first_scale = dt # first scale, here frame length 168 | 169 | dj = scale_distance # distance between scales in octaves 170 | J = num_scales # number of scales 171 | 172 | mother = cwt.MexicanHat() 173 | 174 | if str.lower(mother_name) == 'morlet': 175 | mother = cwt.Morlet(period) 176 | 177 | wavelet_matrix, scales, freqs, *_ = _padded_cwt( 178 | params, 179 | dt, 180 | dj, 181 | first_scale, 182 | J, 183 | mother, 184 | 400) 185 | wavelet_matrix = _scale_for_reconstruction( 186 | wavelet_matrix, 187 | scales, 188 | dj, 189 | dt, 190 | mother=mother_name, 191 | period=period) 192 | 193 | if apply_coi: 194 | wavelet_matrix = _zero_outside_coi(wavelet_matrix, freqs, frame_rate) 195 | 196 | return wavelet_matrix, scales, freqs 197 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/duration_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Constants 8 | ############################################################################### 9 | 10 | 11 | SILENCE_SYMBOLS = [ 12 | '#', 13 | '!pau', 14 | 'sp', 15 | '', 16 | 'pau', 17 | '!sil', 18 | 'sil', 19 | '', 20 | ' ', 21 | '
', 22 | '', 23 | '.', 24 | ',', 25 | '?', 26 | ''] 27 | 28 | 29 | ############################################################################### 30 | # Duration 31 | ############################################################################### 32 | 33 | 34 | def _get_dur_stats(labels, rate=200): 35 | durations = [] 36 | for i in range(len(labels)): 37 | st, en, unit = labels[i] 38 | st *= rate 39 | en *= rate 40 | if unit.lower() not in SILENCE_SYMBOLS: 41 | dur = en - st 42 | dur = np.log(dur + 1.) 43 | durations.append(dur) 44 | durations = np.array(durations) 45 | return np.min(durations), np.max(durations), np.mean(durations) 46 | 47 | 48 | def get_rate(params, hp=10, lp=150): 49 | """ 50 | estimation of speech rate as a center of gravity of wavelet spectrum 51 | similar to method described in "Boundary Detection using Continuous Wavelet Analysis" (2016) 52 | """ 53 | params = emphases.baselines.prominence.smooth_and_interp.smooth(params, hp) 54 | params -= emphases.baselines.prominence.smooth_and_interp.smooth(params, lp) 55 | 56 | wavelet_matrix, *_ = emphases.baselines.prominence.cwt_utils.cwt_analysis( 57 | params, 58 | mother_name='Morlet', 59 | num_scales=80, 60 | scale_distance=.1, 61 | apply_coi=True, 62 | period=2) 63 | wavelet_matrix = abs(wavelet_matrix) 64 | 65 | rate = np.zeros(len(params)) 66 | 67 | for i in range(0,wavelet_matrix.shape[1]): 68 | frame_en = np.sum(wavelet_matrix[:, i]) 69 | # center of gravity 70 | rate[i] = np.nonzero( 71 | wavelet_matrix[:, i].cumsum() >= frame_en * .5)[0].min() 72 | 73 | return emphases.baselines.prominence.smooth_and_interp.smooth(rate, 30) 74 | 75 | 76 | def duration(labels, rate=200): 77 | """Construct duration signal from labels""" 78 | dur = np.zeros(len(labels)) 79 | params = np.zeros(int(labels[-1][1] * rate)) 80 | prev_end = 0 81 | min_dur, *_ = _get_dur_stats(labels, rate=200) 82 | for i in range(0, len(labels)): 83 | st, en, unit = labels[i] 84 | st *= rate 85 | en *= rate 86 | dur[i] = en - st 87 | dur[i] = np.log(dur[i] + 1.) 88 | 89 | if unit.lower() in SILENCE_SYMBOLS: 90 | dur[i] = min_dur 91 | 92 | # skip very short units, likely labelling errors 93 | if en <= st + .01: 94 | continue 95 | 96 | # unit duration -> height of the duration contour in the middle of the unit 97 | index = min(len(params) - 1, int(st + (en - st) / 2.)) 98 | params[index] = dur[i] 99 | 100 | # Handle gaps in labels similarly to silences 101 | if st > prev_end and i > 1: 102 | params[int(prev_end + (st - prev_end) / 2.)] = min_dur 103 | prev_end = en 104 | 105 | # set endpoints to mean in order to avoid large "valleys" 106 | params[0] = np.mean(dur) 107 | params[-1] = np.mean(dur) 108 | 109 | # make continous duration contour and smooth a bit 110 | params = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(params, 'pchip') 111 | return emphases.baselines.prominence.smooth_and_interp.smooth(params, 20) 112 | 113 | 114 | def get_duration_signal( 115 | alignment, 116 | weights=[], 117 | rate=1): 118 | """ 119 | Construct duration contour from labels. 
If many tiers are selected, 120 | construct contours for each tier and return a weighted sum of those 121 | """ 122 | word_tier = [(word.start(), word.end(), str(word)) for word in alignment] 123 | phoneme_tier = [ 124 | (phoneme.start(), phoneme.end(), str(phoneme)) 125 | for phoneme in alignment.phonemes()] 126 | tiers = [phoneme_tier, word_tier] 127 | 128 | durations = [] 129 | 130 | for tier in tiers: 131 | durations.append( 132 | emphases.baselines.prominence.normalize( 133 | duration(tier, rate=rate))) 134 | durations = match_length(durations) 135 | sum_durations = np.zeros(len(durations[0])) 136 | if len(weights) != len(tiers): 137 | weights = np.ones(len(tiers)) 138 | for i in range(len(durations)): 139 | sum_durations += durations[i] * weights[i] 140 | return sum_durations 141 | 142 | 143 | def match_length(sig_list): 144 | """Reduce length of all signals to a the minimum one. 145 | 146 | Parameters 147 | ---------- 148 | sig_list: list 149 | List of signals which are 1D array of samples. 150 | """ 151 | length = min(map(len, sig_list)) 152 | for i in range(0, len(sig_list)): 153 | sig_list[i] = sig_list[i][:int(length)] 154 | return sig_list 155 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/energy_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import emphases 4 | 5 | 6 | def extract_energy( 7 | waveform, 8 | sample_rate=16000, 9 | min_freq=emphases.PROMINENCE_ENERGY_MIN, 10 | max_freq=emphases.PROMINENCE_ENERGY_MAX, 11 | frame_rate=200): 12 | # Get butterworth bandpass filter parameters 13 | lp_waveform = emphases.baselines.prominence.filter.butter_bandpass_filter( 14 | waveform, 15 | min_freq, 16 | max_freq, 17 | sample_rate, 18 | order=5) 19 | 20 | # Compute energy 21 | energy = np.sqrt(lp_waveform ** 2) 22 | 23 | # Resample to frame rate 24 | return emphases.baselines.prominence.resample(energy, sample_rate, frame_rate) 25 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/f0_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import emphases 4 | 5 | 6 | def rolling_window(a, window): 7 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 8 | strides = a.strides + (a.strides[-1],) 9 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 10 | 11 | 12 | def _cut_boundary_vals(params, num_vals): 13 | cutted = np.array(params) 14 | for i in range(num_vals, len(params) - num_vals): 15 | if params[i] <= 0 and params[i + 1] > 0: 16 | for j in range(i, i + num_vals): 17 | cutted[j] = 0. 18 | 19 | if params[i] > 0 and params[i + 1] <= 0: 20 | for j in range(i - num_vals, i + 1): 21 | cutted[j] = 0. 22 | 23 | return cutted 24 | 25 | 26 | def _remove_outliers(log_pitch): 27 | fixed = np.array(log_pitch) 28 | 29 | # Remove outlier f0 values from voicing boundaries 30 | boundary_cut = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros( 31 | _cut_boundary_vals(fixed, 3), 32 | 'linear') 33 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(fixed, 'linear') 34 | fixed[abs(interp - boundary_cut) > .1] = 0 35 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(fixed, 'linear') 36 | 37 | # iterative outlier removal 38 | # 1. 
compare current contour estimate to a smoothed contour and remove deviates larger than threshold 39 | # 2. smooth current estimate with shorter window, thighten threshold 40 | # 3. goto 1. 41 | 42 | # In practice, first handles large scale octave jump type errors, 43 | # finally small scale 'errors' like consonant perturbation effects and 44 | # other irregularities in voicing boundaries 45 | # 46 | # if this appears to remove too many correct values, increase thresholds 47 | num_iter = 30 48 | max_win_len = 100 49 | min_win_len = 10 50 | max_threshold = 3. # threshold with broad window 51 | min_threshold = .5 # threshold with shorted window 52 | 53 | _std = np.std(interp) 54 | # do not tie fixing to liveliness of the original 55 | _std = .3 56 | 57 | win_len = np.exp( 58 | np.linspace(np.log(max_win_len), np.log(min_win_len), num_iter + 1)) 59 | outlier_threshold = np.linspace( 60 | _std * max_threshold, 61 | _std * min_threshold, 62 | num_iter + 1) 63 | for i in range(0, num_iter): 64 | smooth_contour = emphases.baselines.prominence.smooth_and_interp.smooth(interp, win_len[i]) 65 | low_limit = smooth_contour - outlier_threshold[i] 66 | # bit more careful upwards, not to cut emphases 67 | hi_limit = smooth_contour + outlier_threshold[i] * 1.5 68 | 69 | # octave jump down fix, more harm than good? 70 | fixed[interp > hi_limit] = 0 71 | fixed[interp < low_limit] = 0 72 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(fixed, 'linear') 73 | 74 | return fixed 75 | 76 | 77 | def _interpolate(f0): 78 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(f0) 79 | _std = np.std(interp) 80 | _min = np.min(interp) 81 | low_limit = emphases.baselines.prominence.smooth_and_interp.smooth(interp, 200) - 1.5 * _std 82 | low_limit[low_limit < _min] = _min 83 | hi_limit = emphases.baselines.prominence.smooth_and_interp.smooth(interp, 100) + 2. 
* _std 84 | voicing = np.array(f0) 85 | constrained = np.array(f0) 86 | constrained = np.maximum(f0, low_limit) 87 | constrained = np.minimum(constrained, hi_limit) 88 | interp = emphases.baselines.prominence.smooth_and_interp.peak_smooth( 89 | constrained, 90 | 100, 91 | 20, 92 | voicing=voicing) 93 | # smooth voiced parts a bit too 94 | return emphases.baselines.prominence.smooth_and_interp.peak_smooth(interp, 3, 2) 95 | 96 | 97 | def process(f0): 98 | log_pitch = np.array(f0) 99 | log_scaled = True 100 | if np.mean(f0[f0 > 0]) > 20: 101 | log_scaled = False 102 | log_pitch[f0 > 0] = np.log(f0[f0 > 0]) 103 | log_pitch[f0 <= 0] = 0 104 | 105 | log_pitch = _remove_outliers(log_pitch) 106 | log_pitch = _interpolate(log_pitch) 107 | 108 | if not log_scaled: 109 | return np.exp(log_pitch) 110 | else: 111 | return log_pitch 112 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/filter.py: -------------------------------------------------------------------------------- 1 | from scipy.signal import butter, lfilter 2 | 3 | 4 | def butter_bandpass(lowcut, highcut, fs, order=5): 5 | """Generate the butter bandpass filter 6 | 7 | For more details see scipy.signal.butter documentation 8 | 9 | Parameters 10 | ---------- 11 | lowcut: int 12 | The low cut value 13 | highcut: type 14 | description 15 | fs: int 16 | Signal sample rate 17 | order: int 18 | Order of the butter fiter 19 | 20 | Returns 21 | ------- 22 | b: arraylike 23 | Numerator polynomial of the IIR filter 24 | a: arraylike 25 | Denominator polynomial of the IIR filter 26 | """ 27 | nyq = .5 * fs 28 | low = lowcut / nyq 29 | if highcut >= nyq * .95: 30 | highcut = nyq * .95 31 | high = highcut / nyq 32 | b, a = butter(order, [low, high], btype='band') 33 | return b, a 34 | 35 | 36 | def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): 37 | """Filter signal data using a butter filter type 38 | 39 | For more details see scipy.signal.butter and scipy.signal.lfilter documentation 40 | 41 | Parameters 42 | ---------- 43 | data: arraylike 44 | An N-dimensional input array. 45 | lowcut: int 46 | The lowcut filtering value. 47 | highcut: type 48 | The highcut filtering value. 49 | fs: int 50 | The signal sample rate. 51 | order: int 52 | The order of the butter filter. 53 | 54 | Returns 55 | ------- 56 | arraylike 57 | An N-dimensional filtered array 58 | """ 59 | b, a = butter_bandpass(lowcut, highcut, fs, order=order) 60 | return lfilter(b, a, data) 61 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/loma.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from operator import itemgetter 3 | 4 | 5 | def simplify(loma): 6 | """? 7 | Parameters 8 | ---------- 9 | loma: type 10 | description 11 | """ 12 | simplified = [] 13 | for l in loma: 14 | # align loma to it's position in the middle of the line 15 | pos = l[int(len(l) / 2.)][0] 16 | strength = l[-1][1] 17 | simplified.append((pos, strength)) 18 | return simplified 19 | 20 | 21 | def get_prominences(pos_loma, alignment, rate=1): 22 | """? 23 | Parameters 24 | ---------- 25 | pos_loma: list of ? 
26 | Positive loma values 27 | labels: list of tuple (float, float, string) 28 | List of labels which are lists of 3 elements [start, end, description] 29 | """ 30 | max_word_loma = [] 31 | loma = simplify(pos_loma) 32 | for st, end in [(word.start(), word.end()) for word in alignment]: 33 | st *= rate 34 | end *= rate 35 | word_loma = [] 36 | for l in loma: 37 | if l[0] >= st and l[0] <= end: 38 | word_loma.append(l) 39 | if len(word_loma) > 0: 40 | max_word_loma.append(sorted(word_loma, key=itemgetter(1))[-1]) 41 | else: 42 | max_word_loma.append([st + (end - st) / 2., 0.]) 43 | return max_word_loma 44 | 45 | 46 | def get_boundaries(max_word_loma, boundary_loma, alignment): 47 | """get strongest lines of minimum amplitude between adjacent words' max lines""" 48 | boundary_loma = simplify(boundary_loma) 49 | max_boundary_loma = [] 50 | st = 0 51 | end = 0 52 | for i in range(1, len(max_word_loma)): 53 | w_boundary_loma = [] 54 | for l in boundary_loma: 55 | st = max_word_loma[i - 1][0] 56 | end = max_word_loma[i][0] 57 | if l[0] >= st and l[0] < end: 58 | if l[1] > 0: 59 | w_boundary_loma.append(l) 60 | 61 | if len(w_boundary_loma) > 0: 62 | max_boundary_loma.append( 63 | sorted(w_boundary_loma, key=itemgetter(1))[-1]) 64 | else: 65 | max_boundary_loma.append([st + (end - st) / 2, 0]) 66 | 67 | # final boundary is not estimated 68 | max_boundary_loma.append((alignment.end(), 1)) 69 | 70 | return max_boundary_loma 71 | 72 | 73 | def _get_parent(child_index, parent_diff, parent_indices): 74 | """Private function to find the parent of the given child peak. At child peak index, follow the 75 | slope of parent scale upwards to find parent 76 | 77 | Parameters 78 | ---------- 79 | child_index: int 80 | Index of the current child peak 81 | parent_diff: list of ? 82 | ? 83 | parent_indices: list of int ? 84 | Indices of available parents 85 | 86 | Returns 87 | _______ 88 | int 89 | The parent index or None if there is no parent 90 | """ 91 | for i in range(0, len(parent_indices)): 92 | if parent_indices[i] > child_index: 93 | if parent_diff[int(child_index)] > 0: 94 | return parent_indices[i] 95 | else: 96 | if i > 0: 97 | return parent_indices[i - 1] 98 | else: 99 | return parent_indices[0] 100 | 101 | if len(parent_indices) > 0: 102 | return parent_indices[-1] 103 | 104 | 105 | def get_loma(wavelet_matrix, scales, min_scale, max_scale): 106 | """Get the Line Of Maximum Amplitude (loma) 107 | 108 | Parameters 109 | ---------- 110 | wavelet_matrix: matrix of float 111 | The wavelet matrix 112 | scales: list of int 113 | The list of scales 114 | min_scale: int 115 | The minimum scale 116 | max_scale: int 117 | The maximum scale 118 | 119 | Returns 120 | ------- 121 | list of tuples 122 | ? 123 | 124 | Note 125 | ---- 126 | change this so that one level is done in one chunk, not one parent. 127 | """ 128 | min_peak = -10000. # minimum peak amplitude to consider. NOTE:this has no meaning unless scales normalized 129 | max_dist = 10 # how far in time to look for parent peaks. 
NOTE: frame rate and scale dependent 130 | 131 | # get peaks from the first scale 132 | peaks, indices = get_peaks(wavelet_matrix[min_scale], min_peak) 133 | 134 | loma = dict() 135 | root = dict() 136 | for i in range(0, len(peaks)): 137 | loma[indices[i]] = [] 138 | 139 | # keep track of roots of each loma 140 | root[indices[i]] = indices[i] 141 | 142 | for i in range(min_scale + 1, max_scale): 143 | max_dist = np.sqrt(scales[i]) * 4 144 | 145 | # find peaks in the parent scale 146 | p_peaks, p_indices = get_peaks(wavelet_matrix[i], min_peak) 147 | parents = dict(zip(p_indices, p_peaks)) 148 | 149 | # find a parent for each child peak 150 | children = dict() 151 | for p in p_indices: 152 | children[p] = [] 153 | 154 | parent_diff = np.diff(wavelet_matrix[i], 1) 155 | for j in range(0, len(indices)): 156 | parent =_get_parent(indices[j], parent_diff, p_indices) 157 | if parent: 158 | if abs(parent - indices[j]) < max_dist and peaks[j] > min_peak: 159 | children[parent].append([indices[j], peaks[j]]) 160 | 161 | # for each parent, select max child 162 | peaks = [] 163 | indices = [] 164 | for p in children: 165 | if len(children[p]) > 0: 166 | maxi = sorted(children[p], key=itemgetter(1))[-1] 167 | indices.append(p) 168 | peaks.append(maxi[1] + parents[p]) 169 | 170 | #append child to correct loma 171 | loma[root[maxi[0]]].append([maxi[0], maxi[1] + parents[p], i, p]) 172 | root[p] = root[maxi[0]] 173 | 174 | sorted_loma = [] 175 | for k in sorted(loma.keys()): 176 | if len(loma[k]) > 0: 177 | sorted_loma.append(loma[k]) 178 | 179 | return sorted_loma 180 | 181 | 182 | def get_peaks(params, threshold=-10): 183 | """Find the peaks based on the given prosodic parameters. 184 | 185 | Parameters 186 | ---------- 187 | params: ? 188 | Prosodic parameters 189 | threshold: int 190 | description 191 | 192 | Returns 193 | ------- 194 | peaks: arraylike 195 | array of peak values and peak indices 196 | """ 197 | indices = (np.diff(np.sign(np.diff(params))) < 0).nonzero()[0] + 1 198 | peaks = params[indices] 199 | return np.array([peaks[peaks > threshold], indices[peaks > threshold]]) 200 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/pitch_tracker.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import scipy.signal 4 | 5 | import emphases 6 | 7 | 8 | def _get_f0(spec, energy, min_hz, max_hz, thresh, sil_thresh): 9 | """ 10 | return frequency bin with maximum energy, if it is over given threshold 11 | and overall energy of the frame is over silence threshsold 12 | otherwise return 0 (unvoiced) 13 | """ 14 | cand = int(min_hz) + np.argmax(spec[int(min_hz):int(max_hz)]) 15 | if spec[cand] > thresh and energy > sil_thresh: 16 | if cand > 2 * min_hz and spec[int(round(cand / 2.))] > spec[cand] * .5: 17 | return int(round(cand / 2.)) 18 | else: 19 | return cand 20 | return 0 21 | 22 | 23 | def _track_pitch( 24 | pic, 25 | min_hz=50, 26 | max_hz=450, 27 | thresh=.1, 28 | energy_thresh=1.): 29 | """ 30 | extract pitch contour from time-frequency image 31 | bin with maximum energy / frame is chosen as a first f0 estimate, 32 | following with refinement steps based on the assumption of continuity of the pitch track 33 | """ 34 | pitch = np.zeros(pic.shape[0]) 35 | 36 | # calc energy threshold for voicing 37 | log_energy = np.log(np.sum(pic, axis=1)) 38 | energy_thresh = \ 39 | np.min(emphases.baselines.prominence.smooth_and_interp.smooth(log_energy, 20)) + energy_thresh 40 | 
pic_smooth = pic * scipy.ndimage.gaussian_filter(pic, [2, 5]) 41 | 42 | # find frequency bins with max_energy 43 | for i in range(0, pic_smooth.shape[0]): 44 | pitch[i] = _get_f0( 45 | pic_smooth[i], 46 | log_energy[i], 47 | min_hz, 48 | max_hz, 49 | thresh, 50 | energy_thresh) 51 | 52 | # second pass with soft constraints 53 | n_iters = 3 54 | from scipy.signal import gaussian 55 | 56 | for iter in range(0, n_iters): 57 | smoothed = emphases.baselines.prominence.f0_processing.process(pitch) 58 | smoothed = emphases.baselines.prominence.smooth_and_interp.smooth(smoothed, int(200. / (iter + 1.))) 59 | 60 | # gradually thightening gaussian window centered on current estimate to softly constrain next iteration 61 | win_len = 800 62 | g_window = gaussian(win_len, int(np.mean(smoothed) * (1. / (iter + 1.) ** 2))) 63 | 64 | for i in range(0, pic.shape[0]): 65 | window = np.zeros(len(pic_smooth[i])) 66 | st = int(np.max((0, int(smoothed[i] - win_len)))) 67 | end = int(np.min((int(smoothed[i] + win_len * .5), win_len - st))) 68 | window[st:end] = g_window[win_len - end:] 69 | pitch[i] = _get_f0( 70 | pic_smooth[i] * window, log_energy[i], 71 | min_hz, 72 | max_hz, 73 | thresh, 74 | energy_thresh) 75 | 76 | return pitch 77 | 78 | 79 | def _assign_to_bins(pic, freqs, mags): 80 | for i in range(1, freqs.shape[0] - 1): 81 | for j in range(0, freqs.shape[1]): 82 | try: 83 | pic[j, int(freqs[i, j])] += mags[i, j] 84 | except: 85 | pass 86 | 87 | 88 | def inst_freq_pitch( 89 | wav_form, 90 | fs, 91 | min_hz=emphases.FMIN, 92 | max_hz=emphases.FMAX, 93 | voicing_thresh=emphases.VOICED_THRESHOLD, 94 | target_rate=200): 95 | """Extract speech f0 using the continuous wavelet transform""" 96 | voicing_thresh = (voicing_thresh - 50.) / 100. 97 | sample_rate = 4000 98 | tmp_wav_form = emphases.baselines.prominence.resample(wav_form, fs, sample_rate) 99 | tmp_wav_form = emphases.baselines.prominence.normalize(tmp_wav_form) 100 | 101 | DEC = int(round(sample_rate / target_rate)) 102 | 103 | pic = np.zeros( 104 | shape=(int(len(tmp_wav_form) / float(DEC)), int(sample_rate / 4.))) 105 | 106 | # use continuous wavelet transform to get instantenous frequencies 107 | # integrate analyses with morlet mother wavelets with period = 5 for 108 | # good time and frequency resolution 109 | # setup wavelet 110 | s0 = 2. / sample_rate 111 | dj = .05 # 20 scales per octave 112 | J = 120 # six octaves 113 | dt = 1. / sample_rate 114 | periods = [5] 115 | for p in periods: 116 | wavelet_matrix, *_ = emphases.baselines.prominence.cwt_utils.cwt_analysis( 117 | tmp_wav_form, 118 | mother_name='morlet', 119 | first_scale=s0, 120 | num_scales=J, 121 | scale_distance=dj, 122 | apply_coi=False, 123 | period=p, 124 | frame_rate=sample_rate) 125 | 126 | # hilbert transform 127 | phase = np.unwrap(np.angle(wavelet_matrix), axis=1) 128 | freqs = np.abs((np.gradient(phase, dt)[1]) / (2. 
* np.pi)) 129 | 130 | freqs = scipy.signal.decimate(freqs, DEC, zero_phase=True) 131 | mags = scipy.signal.decimate(abs(wavelet_matrix), DEC, zero_phase=True) 132 | 133 | # normalize magnitudes 134 | mags = (mags - mags.min()) / mags.ptp() 135 | 136 | # construct time-frequency image 137 | _assign_to_bins(pic, freqs, mags) 138 | 139 | # perform frequency domain autocorrelation to enhance f0 140 | pic = scipy.ndimage.filters.gaussian_filter(pic, [1, 1]) 141 | length = np.min((max_hz * 3, pic.shape[1])).astype(int) 142 | 143 | for i in range(0, pic.shape[0]): 144 | acorr1 = np.correlate(pic[i, :length], pic[i, :length], mode='same') 145 | pic[i, :int(length / 2.)] *= acorr1[int(len(acorr1) / 2.):] 146 | 147 | return _track_pitch(pic, min_hz, max_hz, voicing_thresh) 148 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/smooth_and_interp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import interpolate 3 | 4 | 5 | def remove_bias(params, win_len=300): 6 | return params - smooth(params, win_len) 7 | 8 | 9 | def interpolate_zeros(params, method='pchip', min_val=0): 10 | """ 11 | Interpolate 0 values 12 | :param params: 1D data vector 13 | :param method: 14 | :param factor: factor for interpolation (must be integer) 15 | :return: interpolated 1D vector by a given factor 16 | """ 17 | voiced = np.array(params, float) 18 | for i in range(0, len(voiced)): 19 | if voiced[i] == min_val: 20 | voiced[i] = np.nan 21 | 22 | if np.isnan(voiced[-1]): 23 | voiced[-1] = np.nanmin(voiced) 24 | if np.isnan(voiced[0]): 25 | voiced[0] = np.nanmean(voiced) 26 | 27 | not_nan = np.logical_not(np.isnan(voiced)) 28 | 29 | indices = np.arange(len(voiced)) 30 | if method == 'spline': 31 | interp = interpolate.UnivariateSpline( 32 | indices[not_nan], 33 | voiced[not_nan], 34 | k=2, 35 | s=0) 36 | # return voiced parts intact 37 | smoothed = interp(indices) 38 | for i in range(0, len(smoothed)): 39 | if not np.isnan(voiced[i]): 40 | smoothed[i] = params[i] 41 | return smoothed 42 | 43 | elif method == 'pchip': 44 | interp = interpolate.pchip(indices[not_nan], voiced[not_nan]) 45 | else: 46 | interp = interpolate.interp1d( 47 | indices[not_nan], 48 | voiced[not_nan], 49 | method) 50 | return interp(indices) 51 | 52 | 53 | def smooth(params, win, type='HAMMING'): 54 | """gaussian type smoothing, convolution with hamming window""" 55 | win = int(win + .5) 56 | if win >= len(params) - 1: 57 | win = len(params) - 1 58 | 59 | if win % 2 == 0: 60 | win += 1 61 | 62 | s = np.r_[params[win - 1:0:-1], params, params[-1:-win:-1]] 63 | 64 | if type == 'HAMMING': 65 | w = np.hamming(win) 66 | else: 67 | w = np.ones(win) 68 | 69 | y = np.convolve(w / w.sum(), s, mode='valid') 70 | return y[int(win / 2):-int(win / 2)] 71 | 72 | 73 | def peak_smooth(params, max_iter, win, min_win=2, voicing=[]): 74 | """Iterative smoothing while preserving peaks, 'true envelope' -style""" 75 | smoothed = np.array(params) 76 | win_reduce = np.exp(np.linspace(np.log(win), np.log(min_win), max_iter)) 77 | 78 | for i in range(0, max_iter): 79 | 80 | smoothed = np.maximum(params, smoothed) 81 | 82 | if len(voicing) > 0: 83 | smoothed = smooth(smoothed, int(win + .5)) 84 | smoothed[voicing > 0] = params[voicing > 0] 85 | else: 86 | smoothed = smooth(smoothed, int(win + .5), type='rectangle') 87 | 88 | win = win_reduce[i] 89 | 90 | return smoothed 91 | 
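
The prominence baseline ties the preceding modules together: energy_processing and pitch_tracker produce frame-rate contours from the audio, duration_processing builds a duration contour from the alignment, cwt_utils applies the continuous wavelet transform to their weighted sum, and loma decodes one prominence value per word from the lines of maximum amplitude. A minimal sketch of the top-level call (the paths are hypothetical; the entry point is the infer function in prominence/core.py above):

    import emphases
    import pypar

    # Hypothetical inputs: a word-level alignment and mono audio at 16 kHz
    alignment = pypar.Alignment('example.TextGrid')
    audio = emphases.load.audio('example.wav')

    # Per-word prominence scores; shape (1, words)
    scores = emphases.baselines.prominence.infer(
        alignment,
        audio,
        emphases.SAMPLE_RATE)
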
-------------------------------------------------------------------------------- /emphases/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/config/__init__.py -------------------------------------------------------------------------------- /emphases/config/defaults.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import torch 5 | import GPUtil 6 | 7 | 8 | ############################################################################### 9 | # Metadata 10 | ############################################################################### 11 | 12 | 13 | # Configuration name 14 | CONFIG = 'emphases' 15 | 16 | 17 | ############################################################################### 18 | # Directories 19 | ############################################################################### 20 | 21 | 22 | # Location to save assets to be bundled with pip release 23 | ASSETS_DIR = Path(__file__).parent.parent / 'assets' 24 | 25 | # Location of preprocessed features 26 | CACHE_DIR = Path(__file__).parent.parent.parent / 'data' / 'cache' 27 | 28 | # Location of datasets on disk 29 | DATA_DIR = Path(__file__).parent.parent.parent / 'data' / 'datasets' 30 | 31 | # Location to save evaluation artifacts 32 | EVAL_DIR = Path(__file__).parent.parent.parent / 'eval' 33 | 34 | # Location to save training and adaptation artifacts 35 | RUNS_DIR = Path(__file__).parent.parent.parent / 'runs' 36 | 37 | # Location of compressed datasets on disk 38 | SOURCE_DIR = Path(__file__).parent.parent.parent / 'data' / 'sources' 39 | 40 | 41 | ############################################################################### 42 | # Audio parameters 43 | ############################################################################### 44 | 45 | 46 | # The maximum representable frequency 47 | FMAX = 550. 48 | 49 | # The minumum representable frequency 50 | FMIN = 40. 51 | 52 | # The number of samples between frames 53 | HOPSIZE = 160 54 | 55 | # Minimum decibel level 56 | MIN_DB = -100. 57 | 58 | # Number of linear frequency channels 59 | NUM_FFT = 1024 60 | 61 | # Number of mel channels 62 | NUM_MELS = 80 63 | 64 | # Voiced/unvoiced threshold for pitch estimation 65 | VOICED_THRESHOLD = .1625 66 | 67 | # Reference decibel level 68 | REF_DB = 20. 
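# Note: with HOPSIZE = 160 samples (above) and SAMPLE_RATE = 16000 Hz (below),
# each frame covers 160 / 16000 = 10 ms, i.e. a 100 Hz frame rate;
# HOPSIZE_SECONDS in emphases/config/static.py is derived from this ratio.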
69 | 70 | # The audio samling rate 71 | SAMPLE_RATE = 16000 72 | 73 | # The size of the audio analysis window 74 | WINDOW_SIZE = 1024 75 | 76 | 77 | ############################################################################### 78 | # Data parameters 79 | ############################################################################### 80 | 81 | 82 | # List of all datasets 83 | DATASETS = ['libritts'] 84 | 85 | # Datasets to use for evaluation 86 | EVALUATION_DATASETS = ['libritts'] 87 | 88 | # Whether to use mel features 89 | MEL_FEATURE = True 90 | 91 | # Whether to use loudness features 92 | LOUDNESS_FEATURE = False 93 | 94 | # Maximum number of allowed annotations 95 | MAX_ANNOTATIONS = None 96 | 97 | # Maximum number of training utterances 98 | MAX_TRAINING_UTTERANCES = None 99 | 100 | # Minimum number of allowed annotations 101 | MIN_ANNOTATIONS = None 102 | 103 | # Normalize input representations 104 | NORMALIZE = False 105 | 106 | # Whether to use the specified one-eighth dataset for scaling law experiments 107 | ONE_EIGHTH_UTTERANCES = False 108 | 109 | # Whether to use pitch features 110 | PITCH_FEATURE = False 111 | 112 | # Whether to use periodicity features 113 | PERIODICITY_FEATURE = False 114 | 115 | # Seed for all random number generators 116 | RANDOM_SEED = 0 117 | 118 | # Size of each partition. Must add to 1. 119 | SPLIT_SIZE_TEST = .1 120 | SPLIT_SIZE_TRAIN = .8 121 | SPLIT_SIZE_VALID = .1 122 | 123 | # Dataset to use for training 124 | TRAINING_DATASET = 'libritts' 125 | 126 | # Dataset to use for validation 127 | VALIDATION_DATASET = 'libritts' 128 | 129 | 130 | ############################################################################### 131 | # Evaluation parameters 132 | ############################################################################### 133 | 134 | 135 | # Number of steps between logging to Tensorboard 136 | LOG_INTERVAL = 100 # steps 137 | 138 | # Number of steps to perform for tensorboard logging 139 | LOG_STEPS = 32 140 | 141 | # Number of examples to plot to Tensorboard during training 142 | PLOT_EXAMPLES = 2 143 | 144 | 145 | ############################################################################### 146 | # Wavelet baseline parameters 147 | ############################################################################### 148 | 149 | 150 | # Line of maximum amplitude bounds 151 | LOMA_BOUNDARY_START = -2 # octaves 152 | LOMA_BOUNDARY_END = 1 # octaves 153 | LOMA_PROMINENCE_START = -3 # octaves 154 | LOMA_PROMINENCE_END = 0 # octaves 155 | 156 | # Weight applied to the duration 157 | PROMINENCE_DURATION_WEIGHT = .5 158 | 159 | # Maximum frequency in energy calculation 160 | PROMINENCE_ENERGY_MAX = 5000. 161 | 162 | # Minimum frequency in energy calculation 163 | PROMINENCE_ENERGY_MIN = 200. 164 | 165 | # Weight applied to the energy 166 | PROMINENCE_ENERGY_WEIGHT = 1. 167 | 168 | # Weight applied to the pitch 169 | PROMINENCE_PITCH_WEIGHT = 1. 170 | 171 | # Voiced/unvoiced threshold from 0 (all voiced) to 100 (all unvoiced) 172 | VOICED_THRESHOLD = 50 173 | 174 | 175 | ############################################################################### 176 | # Model parameters 177 | ############################################################################### 178 | 179 | 180 | # Activation function to use in convolution model 181 | ACTIVATION_FUNCTION = torch.nn.ReLU 182 | 183 | # Model architecture. One of ['convolution', 'transformer']. 
184 | ARCHITECTURE = 'convolution' 185 | 186 | # Model width 187 | CHANNELS = 80 188 | 189 | # Decoder convolution kernel size 190 | DECODER_KERNEL_SIZE = 3 191 | 192 | # Dropout probability (or None to not use dropout) 193 | DROPOUT = None 194 | 195 | # Location to perform resampling from frame resolution to word resolution. 196 | # One of ['inference', 'input', 'intermediate', 'loss']. 197 | DOWNSAMPLE_LOCATION = 'intermediate' 198 | 199 | # Method to use for resampling from frame resolution to word resolution. 200 | # One of ['average', 'center', 'max', 'sum']. 201 | DOWNSAMPLE_METHOD = 'sum' 202 | 203 | # Encoder convolution kernel size 204 | ENCODER_KERNEL_SIZE = 3 205 | 206 | # Number of network layers 207 | LAYERS = 6 208 | 209 | # Method to use for inference. One of 210 | # ['neural', 'pitch-variance', 'duration-variance', 'prominence]. 211 | METHOD = 'neural' 212 | 213 | # Method to use for resampling from word resolution to frame resolution. 214 | # One of ['linear', 'nearest']. 215 | UPSAMPLE_METHOD = 'linear' 216 | 217 | 218 | ############################################################################### 219 | # Training parameters 220 | ############################################################################### 221 | 222 | 223 | # Number of buckets of data lengths used by the sampler 224 | BUCKETS = 2 225 | 226 | # Loss function. One of ['bce', 'mse'] 227 | LOSS = 'bce' 228 | 229 | # Maximum number of frames in one batch 230 | MAX_TRAINING_FRAMES = 75000 231 | 232 | # Number of training steps 233 | NUM_STEPS = 6000 234 | 235 | # Number of data loading worker threads 236 | try: 237 | NUM_WORKERS = int(os.cpu_count() / max(1, len(GPUtil.getGPUs()))) 238 | except ValueError: 239 | NUM_WORKERS = os.cpu_count() 240 | -------------------------------------------------------------------------------- /emphases/config/static.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Files and directories 8 | ############################################################################### 9 | 10 | 11 | # Directory to save annotation artifacts 12 | ANNOTATION_DIR = emphases.SOURCE_DIR / 'crowdsource' 13 | 14 | # Default configuration file for emphasis annotation 15 | DEFAULT_ANNOTATION_CONFIG = emphases.ASSETS_DIR / 'configs' / 'annotate.yaml' 16 | 17 | # Location to save dataset partitions 18 | PARTITION_DIR = emphases.ASSETS_DIR / 'partitions' 19 | 20 | 21 | ############################################################################### 22 | # Audio parameters 23 | ############################################################################### 24 | 25 | 26 | # The hopsize in seconds 27 | HOPSIZE_SECONDS = emphases.HOPSIZE / emphases.SAMPLE_RATE 28 | 29 | # The maximum representable frequency in log-hz 30 | LOGFMAX = torch.log2(torch.tensor(emphases.FMAX)) 31 | 32 | # The minumum representable frequency in log-hz 33 | LOGFMIN = torch.log2(torch.tensor(emphases.FMIN)) 34 | 35 | 36 | ############################################################################### 37 | # Model parameters 38 | ############################################################################### 39 | 40 | 41 | # Number of input features to the model 42 | NUM_FEATURES = ( 43 | emphases.MEL_FEATURE * emphases.NUM_MELS + 44 | int(emphases.PITCH_FEATURE) + 45 | int(emphases.PERIODICITY_FEATURE) + 46 | int(emphases.LOUDNESS_FEATURE)) 47 | 
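
With the default feature flags in config/defaults.py (MEL_FEATURE = True; pitch, periodicity, and loudness disabled), NUM_FEATURES above reduces to the number of mel channels. A small sanity check, assuming the package is imported under that default configuration:

    import emphases

    # Mels are the only enabled input feature under the defaults,
    # so the model consumes NUM_MELS = 80 channels per frame
    assert emphases.NUM_FEATURES == emphases.NUM_MELS == 80
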
-------------------------------------------------------------------------------- /emphases/convert.py: -------------------------------------------------------------------------------- 1 | import emphases 2 | 3 | 4 | ############################################################################### 5 | # Time conversions 6 | ############################################################################### 7 | 8 | 9 | def frames_to_samples(frames): 10 | """Convert number of frames to samples""" 11 | return frames * emphases.HOPSIZE 12 | 13 | 14 | def frames_to_seconds(frames): 15 | """Convert number of frames to seconds""" 16 | return frames * emphases.HOPSIZE_SECONDS 17 | 18 | 19 | def seconds_to_frames(seconds): 20 | """Convert seconds to number of frames""" 21 | return samples_to_frames(seconds_to_samples(seconds)) 22 | 23 | 24 | def seconds_to_samples(seconds): 25 | """Convert seconds to number of samples""" 26 | return seconds * emphases.SAMPLE_RATE 27 | 28 | 29 | def samples_to_frames(samples): 30 | """Convert samples to number of frames""" 31 | return samples // emphases.HOPSIZE 32 | 33 | 34 | def samples_to_seconds(samples): 35 | """Convert number of samples to seconds""" 36 | return samples / emphases.SAMPLE_RATE 37 | -------------------------------------------------------------------------------- /emphases/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import download 2 | from . import preprocess 3 | from .collate import collate 4 | from .dataset import Dataset 5 | from .loader import loader 6 | from .sampler import sampler 7 | -------------------------------------------------------------------------------- /emphases/data/collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Batch collation 8 | ############################################################################### 9 | 10 | 11 | def collate(batch): 12 | """Batch collation""" 13 | # Unpack 14 | features, scores, word_bounds, alignments, audios, stems = zip(*batch) 15 | 16 | # Get word lengths 17 | word_lengths = torch.tensor( 18 | [bounds.shape[-1] for bounds in word_bounds], 19 | dtype=torch.long) 20 | max_word_length = word_lengths.max().item() 21 | 22 | # Get frame lengths 23 | frame_lengths = torch.tensor( 24 | [feat.shape[-1] for feat in features], 25 | dtype=torch.long) 26 | max_frame_length = frame_lengths.max().item() 27 | 28 | # Network output lengths 29 | output_lengths = word_lengths 30 | max_output_length = max_word_length 31 | 32 | # Allocate padded tensors 33 | padded_features = torch.zeros( 34 | (len(features), emphases.NUM_FEATURES, max_frame_length)) 35 | padded_scores = torch.zeros((len(scores), 1, max_output_length)) 36 | padded_bounds = torch.zeros( 37 | (len(word_bounds), 2, max_word_length), 38 | dtype=torch.long) 39 | padded_audio = torch.zeros( 40 | (len(audios), 1, max_frame_length * emphases.HOPSIZE)) 41 | 42 | # Place batch in padded tensors 43 | for ( 44 | i, 45 | (bounds, audio, feat, score, frame_length, word_length, output_length) 46 | ) in enumerate( 47 | zip( 48 | word_bounds, 49 | audios, 50 | features, 51 | scores, 52 | frame_lengths, 53 | word_lengths, 54 | output_lengths) 55 | ): 56 | 57 | # Pad features 58 | padded_features[i, :, :frame_length] = feat 59 | 60 | # Pad scores 61 | padded_scores[i, :, :output_length] = score[:, :output_length] 62 | 63 | # Pad 
word bounds 64 | padded_bounds[i, :, :word_length] = bounds[:, :word_length] 65 | 66 | # Pad audio 67 | end_sample = frame_length * emphases.HOPSIZE 68 | padded_audio[i, :, :end_sample] = audio[:, :end_sample] 69 | 70 | return ( 71 | padded_features, 72 | frame_lengths, 73 | padded_bounds, 74 | word_lengths, 75 | padded_scores, 76 | alignments, 77 | padded_audio, 78 | stems) 79 | -------------------------------------------------------------------------------- /emphases/data/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pypar 5 | import torch 6 | import torchaudio 7 | 8 | import emphases 9 | 10 | 11 | ############################################################################### 12 | # Dataset 13 | ############################################################################### 14 | 15 | 16 | class Dataset(torch.utils.data.Dataset): 17 | 18 | def __init__(self, name, partition): 19 | self.cache = emphases.CACHE_DIR / name 20 | 21 | # Get list of stems 22 | with open(emphases.PARTITION_DIR / f'{name}.json') as file: 23 | self.stems = json.load(file)[partition] 24 | 25 | # Store lengths for bucketing 26 | audio_files = [ 27 | self.cache / 'audio' / f'{stem}.wav' for stem in self.stems] 28 | self.lengths = [ 29 | emphases.convert.samples_to_frames( 30 | torchaudio.info(audio_file).num_frames) 31 | for audio_file in audio_files] 32 | 33 | # Total number of frames 34 | self.frames = sum(self.lengths) 35 | 36 | def __getitem__(self, index): 37 | """Retrieve the indexth item""" 38 | stem = self.stems[index] 39 | 40 | # Load alignment 41 | alignment = pypar.Alignment( 42 | self.cache / 'alignment' / f'{stem}.TextGrid') 43 | 44 | # Compute word bounds 45 | bounds = alignment.word_bounds( 46 | emphases.SAMPLE_RATE, 47 | emphases.HOPSIZE, 48 | silences=True) 49 | word_bounds = torch.cat( 50 | [torch.tensor(bound)[None] for bound in bounds]).T 51 | 52 | # Load audio 53 | audio = emphases.load.audio(self.cache / 'audio' / f'{stem}.wav') 54 | 55 | features = [] 56 | 57 | # Load mels 58 | if emphases.MEL_FEATURE: 59 | features.append(torch.load(self.cache / 'mels' / f'{stem}.pt')) 60 | 61 | # Load pitch 62 | if emphases.PITCH_FEATURE: 63 | pitch = torch.load(self.cache / 'pitch' / f'{stem}-pitch.pt') 64 | if emphases.NORMALIZE: 65 | features.append( 66 | (torch.log2(pitch) - emphases.LOGFMIN) / 67 | (emphases.LOGFMAX - emphases.LOGFMIN)) 68 | else: 69 | features.append(torch.log2(pitch)) 70 | 71 | # Load periodicity 72 | if emphases.PERIODICITY_FEATURE: 73 | periodicity = torch.load( 74 | self.cache / 'pitch' / f'{stem}-periodicity.pt') 75 | features.append(periodicity) 76 | 77 | # Load loudness 78 | if emphases.LOUDNESS_FEATURE: 79 | loudness = torch.load(self.cache / 'loudness' / f'{stem}.pt') 80 | features.append(loudness) 81 | 82 | # Concatenate 83 | features = features[0] if len(features) == 1 else torch.cat(features) 84 | 85 | # Load per-word ground truth emphasis scores 86 | scores = torch.load(self.cache / 'scores' / f'{stem}.pt')[None] 87 | 88 | return features, scores, word_bounds, alignment, audio, stem 89 | 90 | def __len__(self): 91 | """Length of the dataset""" 92 | return len(self.stems) 93 | 94 | def buckets(self): 95 | """Partition indices into buckets based on length for sampling""" 96 | # Get the size of a bucket 97 | size = len(self) // emphases.BUCKETS 98 | 99 | # Get indices in order of length 100 | indices = np.argsort(self.lengths) 101 | lengths = np.sort(self.lengths) 102 | 103 | # 
Split into buckets based on length 104 | buckets = [ 105 | np.stack((indices[i:i + size], lengths[i:i + size])).T 106 | for i in range(0, len(self), size)] 107 | 108 | # Concatenate partial bucket 109 | if len(buckets) == emphases.BUCKETS + 1: 110 | residual = buckets.pop() 111 | buckets[-1] = np.concatenate((buckets[-1], residual), axis=0) 112 | 113 | return buckets 114 | -------------------------------------------------------------------------------- /emphases/data/download/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/data/download/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Download datasets 8 | ############################################################################### 9 | 10 | 11 | def parse_args(): 12 | """Parse command-line arguments""" 13 | parser = argparse.ArgumentParser(description='Download datasets') 14 | parser.add_argument( 15 | '--datasets', 16 | nargs='+', 17 | default=emphases.DATASETS, 18 | help='The datasets to download') 19 | parser.add_argument( 20 | '--gpu', 21 | type=int, 22 | help='The index of the gpu to run inference on') 23 | return parser.parse_known_args()[0] 24 | 25 | 26 | emphases.data.download.datasets(**vars(parse_args())) 27 | -------------------------------------------------------------------------------- /emphases/data/download/core.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import shutil 4 | import ssl 5 | import tarfile 6 | import urllib 7 | import yaml 8 | 9 | import pyfoal 10 | import pypar 11 | import torch 12 | import torchutil 13 | import torchaudio 14 | 15 | import emphases 16 | 17 | 18 | ############################################################################### 19 | # Constants 20 | ############################################################################### 21 | 22 | 23 | # List of tokens to filter from Buckeye annotations 24 | BUCKEYE_FILTER_LIST = [ 25 | '{B_TRANS}', 26 | '{E_TRANS}', 27 | '', 28 | '', 29 | '', 30 | '', 31 | '', 32 | '', 33 | '', 34 | '', 35 | '', 36 | '', 37 | '', 38 | '', 39 | '', 40 | '', 41 | '', 42 | '', 43 | ] 44 | 45 | # Speakers selected by sorting the train-clean-100 speakers by longest total 46 | # recording duration and manually selecting speakers with more natural, 47 | # conversational (as opposed to read) prosody 48 | LIBRITTS_SPEAKERS = [ 49 | # Top 5 Female (primarily by length) 50 | 40, 51 | 669, 52 | 4362, 53 | 5022, 54 | 8123, 55 | 56 | # Additional female speakers to get to 1/8th of train-clean-100 57 | 5022, 58 | 696, 59 | 6272, 60 | 5163, 61 | 62 | # Top 5 Male (primarily by length) 63 | 196, 64 | 460, 65 | 1355, 66 | 3664, 67 | 7067, 68 | 69 | # Additional male speakers to get to 1/8th of train-clean-100 70 | 405, 71 | 6437, 72 | 446, 73 | 4397 74 | ] 75 | 76 | 77 | ############################################################################### 78 | # Download datasets 79 | ############################################################################### 80 | 81 | 82 | @torchutil.notify('download') 83 | def datasets(datasets, gpu=None): 84 | """Download datasets""" 85 | for dataset in datasets: 86 | if dataset == 'automatic': 87 | automatic(gpu=gpu) 88 | elif 
dataset == 'buckeye': 89 | buckeye() 90 | elif dataset == 'crowdsource': 91 | crowdsource() 92 | elif dataset == 'libritts': 93 | libritts() 94 | else: 95 | raise ValueError(f'Dataset {dataset} is not defined') 96 | 97 | 98 | ############################################################################### 99 | # Individual dataset downloaders 100 | ############################################################################### 101 | 102 | 103 | def automatic(gpu=None): 104 | """Create dataset from trained model""" 105 | # Setup directories 106 | cache_directory = emphases.CACHE_DIR / 'automatic' 107 | cache_directory.mkdir(exist_ok=True, parents=True) 108 | 109 | # Create subdirectories 110 | features = ['alignment', 'audio', 'scores'] 111 | for feature in features: 112 | (cache_directory / feature).mkdir(exist_ok=True, parents=True) 113 | 114 | # Get files 115 | audio_files = list( 116 | (emphases.CACHE_DIR / 'libritts' / 'audio').rglob('*.wav')) 117 | stems = [file.stem for file in audio_files] 118 | 119 | # Copy from LibriTTS cache to annotation cache 120 | for stem in stems: 121 | 122 | # Copy audio 123 | audio_file = ( 124 | emphases.CACHE_DIR / 'automatic' / 'audio' / f'{stem}.wav') 125 | shutil.copyfile( 126 | emphases.CACHE_DIR / 'libritts' / 'audio' / f'{stem}.wav', 127 | audio_file) 128 | 129 | # Copy alignment 130 | shutil.copyfile( 131 | emphases.CACHE_DIR / 'libritts' / 'alignment' / f'{stem}.TextGrid', 132 | emphases.CACHE_DIR / 'automatic' / 'alignment' / f'{stem}.TextGrid') 133 | 134 | # Load alignment 135 | alignment = pypar.Alignment( 136 | emphases.CACHE_DIR / 'automatic' / 'alignment' / f'{stem}.TextGrid') 137 | 138 | # Load audio 139 | audio, _ = torchaudio.load(audio_file) 140 | 141 | # Infer scores 142 | scores = emphases.from_alignment_and_audio( 143 | alignment, 144 | audio, 145 | emphases.SAMPLE_RATE, 146 | gpu=gpu).detach().cpu() 147 | 148 | # Save scores 149 | torch.save(scores, cache_directory / 'scores' / f'{stem}.pt') 150 | 151 | 152 | def crowdsource(): 153 | """Prepare crowdsourced dataset""" 154 | # Get annotation config 155 | with open(emphases.DEFAULT_ANNOTATION_CONFIG, "r") as stream: 156 | annotation_config = yaml.safe_load(stream) 157 | 158 | # Setup directories 159 | data_directory = emphases.DATA_DIR / 'crowdsource' 160 | cache_directory = emphases.CACHE_DIR / 'crowdsource' 161 | cache_directory.mkdir(exist_ok=True, parents=True) 162 | 163 | # Create subdirectories 164 | features = ['alignment', 'audio', 'scores'] 165 | for feature in features: 166 | (cache_directory / feature).mkdir(exist_ok=True, parents=True) 167 | 168 | # Load annotations data 169 | annotation_data = {} 170 | for directory in data_directory.glob('*'): 171 | 172 | source_directory = directory / annotation_config['name'] 173 | table_directory = source_directory / 'tables' 174 | 175 | # Participant data 176 | participants = {} 177 | with open(table_directory / 'participants.csv') as file: 178 | for row in csv.DictReader(file): 179 | try: 180 | 181 | # Crowdsourced annotation 182 | participants[row['ID']] = { 183 | 'language': row['Language'], 184 | 'country': row['Country'], 185 | 'annotations': []} 186 | 187 | except KeyError as error: 188 | 189 | # Manual annotation 190 | participants[row['ID']] = { 191 | 'language': 'English', 192 | 'country': 'United States', 193 | 'annotations': []} 194 | 195 | # Response data 196 | with open(table_directory / 'responses.csv') as file: 197 | for row in csv.DictReader(file): 198 | participant = row['Participant'] 199 | 200 | # Add participant 
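As an illustrative aside (not part of the repository files): the response-parsing loop here joins each row of `responses.csv` with its participant and, further down in this function, expands the response string, one '0'/'1' character per non-silent word, into a list of per-word scores. A minimal sketch of that decoding, using a hypothetical row and word list rather than real project data:

```python
# Hypothetical annotation row: one '0'/'1' character per non-silent word
row = {'Stem': 'libritts-0001', 'Response': '01001'}
words = ['the', 'quick', 'brown', 'fox', 'jumps']

# Expand the character string into per-word emphasis scores
scores = [float(c) for c in row['Response']]
assert len(scores) == len(words)

print(dict(zip(words, scores)))
# {'the': 0.0, 'quick': 1.0, 'brown': 0.0, 'fox': 0.0, 'jumps': 1.0}
```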
201 | if participant not in annotation_data: 202 | annotation_data[participant] = participants[participant] 203 | 204 | # Get word start and end times 205 | alignment = pypar.Alignment( 206 | emphases.CACHE_DIR / 207 | 'libritts' / 208 | 'alignment' / 209 | f'{row["Stem"]}.TextGrid') 210 | words = [ 211 | (str(word).lower(), word.start(), word.end()) 212 | for word in alignment 213 | if str(word) != pypar.SILENCE] 214 | 215 | # Format annotation 216 | entry = { 217 | 'stem': row['Stem'], 218 | 'score': [float(c) for c in row['Response']], 219 | 'words': words} 220 | assert len(entry['words']) == len(entry['score']) 221 | 222 | # Add annotation 223 | annotation_data[participant]['annotations'].append(entry) 224 | 225 | # Get worker ID correspondence 226 | correspondence = {} 227 | for directory in data_directory.glob('*'): 228 | file = ( 229 | directory / 230 | annotation_config['name'] / 231 | 'crowdsource' / 232 | 'crowdsource.json') 233 | with open(file) as file: 234 | contents = json.load(file) 235 | for content in contents: 236 | correspondence |= {content['ParticipantID']: content['WorkerId']} 237 | 238 | # Crowdsourced annotation 239 | if correspondence: 240 | 241 | # Filter out where incomplete or > 1/3 examples have > 2/3 words selected 242 | def valid(items): 243 | if not hasattr(valid, 'count'): 244 | valid.count = 0 245 | sums = [sum(item['score']) for item in items] 246 | counts = [len(item['score']) for item in items] 247 | invalids = [s > .67 * c for s, c in zip(sums, counts)] 248 | is_valid = sum(invalids) < .33 * len(invalids) 249 | valid.count += 1 - int(is_valid) 250 | return is_valid 251 | 252 | # Join participants with same worker ID 253 | joined = {} 254 | for participant, contents in annotation_data.items(): 255 | 256 | # Filter out bad batches 257 | if ( 258 | len(contents['annotations']) < 20 or 259 | len(contents['annotations']) % 10 > 0 or 260 | not valid(contents['annotations']) 261 | ): 262 | continue 263 | 264 | worker = correspondence[participant] 265 | if worker in joined: 266 | joined[worker]['annotations'].extend(contents['annotations']) 267 | else: 268 | joined[worker] = contents 269 | 270 | # Manual annotation 271 | else: 272 | joined = annotation_data 273 | 274 | # Anonymize 275 | anonymized = {} 276 | for i, contents in enumerate(joined.values()): 277 | anonymized[f'{i:06d}'] = contents 278 | 279 | # Save annotations in release format 280 | with open(cache_directory / 'annotations.json', 'w') as file: 281 | json.dump(anonymized, file, sort_keys=True, indent=True) 282 | 283 | # Merge binary annotations to floats 284 | annotations = merge_annotations(anonymized) 285 | 286 | # Save dictionary containing annotation counts 287 | with open(cache_directory / 'counts.json', 'w') as file: 288 | json.dump(annotations['stems'], file, sort_keys=True, indent=True) 289 | 290 | # Get annotated stems 291 | stems = [ 292 | file.replace('libritts-', '') 293 | for file in annotations['stems'].keys()] 294 | 295 | # Copy from LibriTTS cache to annotation cache 296 | for i, stem in enumerate(stems): 297 | 298 | # Get normalized scores 299 | count = annotations['stems'][stem] 300 | labels = [score / count for score in annotations['scores'][stem]] 301 | 302 | # Copy audio 303 | shutil.copyfile( 304 | emphases.CACHE_DIR / 'libritts' / 'audio' / f'{stem}.wav', 305 | emphases.CACHE_DIR / 'crowdsource' / 'audio' / f'{stem}.wav') 306 | 307 | # Copy alignment 308 | shutil.copyfile( 309 | emphases.CACHE_DIR / 'libritts' / 'alignment' / f'{stem}.TextGrid', 310 | emphases.CACHE_DIR / 
'crowdsource' / 'alignment' / f'{stem}.TextGrid') 311 | 312 | # Load alignment 313 | alignment = pypar.Alignment( 314 | emphases.CACHE_DIR / 'crowdsource' / 'alignment' / f'{stem}.TextGrid') 315 | 316 | # Match alignment and scores (silences get a score of zero) 317 | j = 0 318 | scores = torch.zeros(len(alignment)) 319 | for i, word in enumerate(alignment): 320 | 321 | # Keep silences as zero 322 | if str(word) == pypar.SILENCE: 323 | continue 324 | 325 | # Update scores 326 | scores[i] = float(labels[j]) 327 | 328 | j += 1 329 | 330 | # Save scores 331 | torch.save(scores, cache_directory / 'scores' / f'{stem}.pt') 332 | 333 | 334 | def buckeye(): 335 | """Download buckeye dataset""" 336 | # Extract tar file to data directory 337 | file = emphases.SOURCE_DIR / 'buckeye' / 'buckeye.tar.gz' 338 | with tarfile.open(file, 'r:gz') as tfile: 339 | tfile.extractall(emphases.DATA_DIR) 340 | 341 | # Setup cache directory 342 | cache_directory = emphases.CACHE_DIR / 'buckeye' 343 | cache_directory.mkdir(exist_ok=True, parents=True) 344 | 345 | # Create subdirectories 346 | features = ['alignment', 'audio', 'scores'] 347 | for feature in features: 348 | (cache_directory / feature).mkdir(exist_ok=True, parents=True) 349 | 350 | # Copy alignments and filter out unused tokens 351 | data_directory = emphases.DATA_DIR / 'buckeye' 352 | alignment_files = (data_directory / 'alignment').glob('*.TextGrid') 353 | for file in alignment_files: 354 | 355 | # Load alignment 356 | alignment = pypar.Alignment(file) 357 | 358 | # Filter 359 | for word in alignment: 360 | if str(word) in BUCKEYE_FILTER_LIST: 361 | word.word = pypar.SILENCE 362 | word.phonemes = [ 363 | pypar.Phoneme(pypar.SILENCE, word.start(), word.end())] 364 | 365 | # Deduplicate silence tokens 366 | i = 0 367 | words = alignment.words() 368 | prev_silence = False 369 | while i < len(words): 370 | word = words[i] 371 | if str(word) == pypar.SILENCE: 372 | if prev_silence: 373 | words[i - 1][-1]._end = word.end() 374 | del words[i] 375 | else: 376 | prev_silence = True 377 | i += 1 378 | else: 379 | prev_silence = False 380 | i += 1 381 | 382 | # Save alignment 383 | pypar.Alignment(words).save( 384 | cache_directory / 'alignment' / f'{file.stem}.TextGrid') 385 | 386 | # Get audio files 387 | audio_files = sorted((data_directory / 'audio').glob('*.wav')) 388 | 389 | # Resample audio 390 | for audio_file in audio_files: 391 | 392 | # Load and resample 393 | audio = emphases.load.audio(audio_file) 394 | 395 | # If audio is too quiet, increase the volume 396 | maximum = torch.abs(audio).max() 397 | if maximum < .35: 398 | audio *= .35 / maximum 399 | 400 | # Save to disk 401 | torchaudio.save( 402 | cache_directory / 'audio' / audio_file.name, 403 | audio, 404 | emphases.SAMPLE_RATE) 405 | 406 | # Read buckeye annotations 407 | data_directory = emphases.DATA_DIR / 'buckeye' 408 | with open(data_directory / 'annotations.csv') as file: 409 | reader = csv.DictReader(file) 410 | annotations = [row for row in reader] 411 | 412 | # Extract per-word emphasis scores 413 | alignment_files = (cache_directory / 'alignment').glob('*.TextGrid') 414 | for file in alignment_files: 415 | 416 | # Load alignment 417 | alignment = pypar.Alignment(file) 418 | 419 | # Get words from annotation 420 | words = [word for word in annotations if word['filename'] == file.stem] 421 | words = sorted(words, key=lambda x: float(x['wordmin'])) 422 | 423 | # Get per-word emphasis scores 424 | j = 0 425 | scores = torch.zeros(len(alignment)) 426 | for i, word in enumerate(alignment): 
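The silence-deduplication pass above collapses runs of consecutive silence tokens into a single silence spanning the whole run. As an aside, here is a simplified, pypar-free sketch of the same idea on plain `(label, start, end)` tuples; `SILENCE` is just a stand-in string, not the library constant:

```python
SILENCE = '<silence>'  # stand-in for the aligner's silence token

def deduplicate_silence(words):
    """Merge runs of consecutive silence tokens into one spanning silence."""
    merged = []
    for label, start, end in words:
        if merged and label == SILENCE and merged[-1][0] == SILENCE:
            # Extend the previous silence to cover this one
            merged[-1] = (SILENCE, merged[-1][1], end)
        else:
            merged.append((label, start, end))
    return merged

words = [
    ('hello', 0.0, 0.4),
    (SILENCE, 0.4, 0.6),
    (SILENCE, 0.6, 0.9),
    ('world', 0.9, 1.3)]
print(deduplicate_silence(words))
# [('hello', 0.0, 0.4), ('<silence>', 0.4, 0.9), ('world', 0.9, 1.3)]
```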
427 | 428 | # Keep silences as zero 429 | if str(word) == pypar.SILENCE: 430 | continue 431 | 432 | # Make sure alignments are aligned 433 | assert str(word).lower() == words[j]['word'].lower() 434 | assert (word.start() - float(words[j]['wordmin'])) < 1e-4 435 | assert (word.end() - float(words[j]['wordmax'])) < 1e-4 436 | 437 | # Update scores 438 | # pa.32 is the average of 32 human judgments of the perception of 439 | # prominence based on acoustic features 440 | scores[i] = float(words[j]['pa.32']) 441 | 442 | j += 1 443 | 444 | # Save scores 445 | torch.save(scores, cache_directory / 'scores' / f'{file.stem}.pt') 446 | 447 | 448 | def libritts(): 449 | """Download libritts dataset""" 450 | # Setup source directory 451 | source_directory = emphases.SOURCE_DIR / 'libritts' 452 | source_directory.mkdir(exist_ok=True, parents=True) 453 | 454 | # Download 455 | url = 'https://us.openslr.org/resources/60/train-clean-100.tar.gz' 456 | file = source_directory / 'libritts-train-clean-100.tar.gz' 457 | torchutil.download.file(url, file) 458 | 459 | # Unzip 460 | with tarfile.open(file, 'r:gz') as tfile: 461 | tfile.extractall(emphases.DATA_DIR) 462 | 463 | # Rename folder 464 | directory = emphases.DATA_DIR / 'libritts' 465 | shutil.rmtree(directory, ignore_errors=True) 466 | shutil.move(emphases.DATA_DIR / 'LibriTTS', directory) 467 | 468 | # Download annotations from zenodo 469 | url = 'https://zenodo.org/records/10402793/files/libritts-emphasis-annotations.json?download=1' 470 | file = source_directory / 'annotations.json' 471 | torchutil.download.file(url, file) 472 | 473 | # Load annotations 474 | with open(source_directory / 'annotations.json') as file: 475 | annotations = json.load(file) 476 | 477 | # Merge annotations to floats 478 | annotations = merge_annotations(annotations) 479 | 480 | # Get list of audio files 481 | audio_files = list(directory.rglob('*.wav')) 482 | audio_files = [ 483 | file for file in audio_files if file.stem in annotations['stems']] 484 | 485 | # Setup cache directory 486 | cache_directory = emphases.CACHE_DIR / 'libritts' 487 | cache_directory.mkdir(exist_ok=True, parents=True) 488 | 489 | # Create subdirectories 490 | features = ['alignment', 'audio', 'scores'] 491 | for feature in features: 492 | (cache_directory / feature).mkdir(exist_ok=True, parents=True) 493 | 494 | # Iterate over files 495 | for audio_file in torchutil.iterator( 496 | audio_files, 497 | 'Formatting libritts', 498 | total=len(audio_files) 499 | ): 500 | 501 | # Load and resample audio 502 | audio = emphases.load.audio(audio_file) 503 | 504 | # If audio is too quiet, increase the volume 505 | maximum = torch.abs(audio).max() 506 | if maximum < .35: 507 | audio *= .35 / maximum 508 | 509 | # Save audio 510 | stem = audio_file.stem 511 | torchaudio.save( 512 | cache_directory / 'audio' / f'{stem}.wav', 513 | audio, 514 | emphases.SAMPLE_RATE) 515 | 516 | # Align text and audio 517 | text_files = [ 518 | file.with_suffix('.normalized.txt') for file in audio_files] 519 | alignment_files = [ 520 | cache_directory / 'alignment' / f'{file.stem}.TextGrid' 521 | for file in audio_files] 522 | pyfoal.from_files_to_files( 523 | text_files, 524 | audio_files, 525 | alignment_files, 526 | 'p2fa') 527 | 528 | for i, stem in enumerate([file.stem for file in audio_files]): 529 | 530 | # Load alignment 531 | alignment = pypar.Alignment( 532 | cache_directory / 'alignment' / f'{stem}.TextGrid') 533 | 534 | # Get ground truth 535 | count = annotations['stems'][stem] 536 | labels = [score / count for score 
in annotations['scores'][stem]] 537 | 538 | # Match alignment and scores (silences get a score of zero) 539 | j = 0 540 | scores = torch.zeros(len(alignment)) 541 | for i, word in enumerate(alignment): 542 | 543 | # Keep silences as zero 544 | if str(word) == pypar.SILENCE: 545 | continue 546 | 547 | # Update scores 548 | scores[i] = float(labels[j]) 549 | 550 | j += 1 551 | 552 | # Save scores 553 | torch.save(scores, cache_directory / 'scores' / f'{stem}.pt') 554 | 555 | 556 | ############################################################################### 557 | # Utilities 558 | ############################################################################### 559 | 560 | 561 | def download_file(url, file): 562 | """Download file from url""" 563 | with urllib.request.urlopen(url, context=ssl.SSLContext()) as response, \ 564 | open(file, 'wb') as output: 565 | shutil.copyfileobj(response, output) 566 | 567 | 568 | def merge_annotations(annotations): 569 | """Merge crowdsourced annotations""" 570 | merged = {'samples': 0, 'scores': {}, 'stems': {}} 571 | for _, responses in annotations.items(): 572 | 573 | # Iterate over stems 574 | for response in responses['annotations']: 575 | stem = response['stem'] 576 | score = [float(c) for c in list(response['score'])] 577 | 578 | # Merge stem annotations 579 | if stem in merged['stems']: 580 | 581 | # Maybe cap the number of allowed annotations 582 | if ( 583 | emphases.MAX_ANNOTATIONS is not None and 584 | merged['stems'][stem] == emphases.MAX_ANNOTATIONS 585 | ): 586 | continue 587 | 588 | # Update sums and counts 589 | for i in range(len(score)): 590 | merged['scores'][stem][i] += score[i] 591 | merged['stems'][stem] += 1 592 | 593 | # Add new stem 594 | else: 595 | merged['scores'][stem] = score 596 | merged['stems'][stem] = 1 597 | 598 | # Update total number of samples 599 | merged['samples'] += 1 600 | 601 | # Maybe cap the minimum required annotations 602 | if emphases.MIN_ANNOTATIONS is not None: 603 | merged['stems'] = { 604 | stem: count for stem, count in merged['stems'].items() 605 | if count == emphases.MIN_ANNOTATIONS} 606 | merged['scores'] = { 607 | stem: scores for stem, scores in merged['scores'].items() 608 | if stem in merged['stems']} 609 | 610 | return merged 611 | -------------------------------------------------------------------------------- /emphases/data/loader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Dataloader 8 | ############################################################################### 9 | 10 | 11 | def loader(dataset, partition=None, gpu=None): 12 | """Retrieve a data loader""" 13 | # Get dataset 14 | dataset = emphases.data.Dataset(dataset, partition) 15 | 16 | # Get sampler 17 | sampler = emphases.data.sampler(dataset, partition) 18 | 19 | # Create loader 20 | return torch.utils.data.DataLoader( 21 | dataset, 22 | num_workers=emphases.NUM_WORKERS, 23 | pin_memory=gpu is not None, 24 | collate_fn=emphases.data.collate, 25 | batch_sampler=sampler) 26 | -------------------------------------------------------------------------------- /emphases/data/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from . import mels 3 | from . 
import loudness 4 | -------------------------------------------------------------------------------- /emphases/data/preprocess/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Entry point 8 | ############################################################################### 9 | 10 | 11 | def parse_args(): 12 | """Parse command-line arguments""" 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | '--datasets', 16 | nargs='+', 17 | default=emphases.DATASETS, 18 | help='The datasets to preprocess') 19 | parser.add_argument( 20 | '--gpu', 21 | type=int, 22 | help='The index of the gpu to run inference on') 23 | return parser.parse_known_args()[0] 24 | 25 | 26 | if __name__ == '__main__': 27 | emphases.data.preprocess.datasets(**vars(parse_args())) 28 | -------------------------------------------------------------------------------- /emphases/data/preprocess/core.py: -------------------------------------------------------------------------------- 1 | import penn 2 | import torch 3 | import torchutil 4 | 5 | import emphases 6 | 7 | 8 | ############################################################################### 9 | # Preprocess 10 | ############################################################################### 11 | 12 | 13 | @torchutil.notify('preprocess') 14 | def datasets(datasets, gpu=None): 15 | """Preprocess datasets""" 16 | for dataset in datasets: 17 | cache_directory = emphases.CACHE_DIR / dataset 18 | 19 | # Get audio files, from cache 20 | audio_files = sorted(cache_directory.rglob('*.wav')) 21 | 22 | # Preprocess mels 23 | mel_files = [ 24 | cache_directory / 'mels' / f'{file.stem}.pt' 25 | for file in audio_files] 26 | emphases.data.preprocess.mels.from_files_to_files( 27 | audio_files, 28 | mel_files) 29 | 30 | # Preprocess loudness 31 | loudness_files = [ 32 | cache_directory / 'loudness' / f'{file.stem}.pt' 33 | for file in audio_files] 34 | emphases.data.preprocess.loudness.from_files_to_files( 35 | audio_files, 36 | loudness_files) 37 | 38 | # Preprocess pitch, periodicity 39 | (cache_directory / 'pitch').mkdir(exist_ok=True, parents=True) 40 | pitch_files = [ 41 | cache_directory / 'pitch' / f'{file.stem}' 42 | for file in audio_files] 43 | penn.from_files_to_files( 44 | audio_files, 45 | pitch_files, 46 | hopsize=emphases.convert.samples_to_seconds(emphases.HOPSIZE), 47 | fmin=emphases.FMIN, 48 | fmax=emphases.FMAX, 49 | batch_size=2048, 50 | center='half-hop', 51 | interp_unvoiced_at=emphases.VOICED_THRESHOLD, 52 | num_workers=emphases.NUM_WORKERS, 53 | gpu=gpu) 54 | 55 | # Pitch and periodicity use floating-point hopsize, while mels and 56 | # loudness use an integer hopsize in samples. This results in 57 | # single-frame differences when the audio length is within one sample 58 | # of a new frame due to floating-point error. We simply remove the last 59 | # frame in this rare case. 
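Because pitch and periodicity are extracted with a hopsize specified in seconds while mels and loudness hop by an integer number of samples, the resulting frame counts can disagree by one, as the comment above notes. A generic sketch of the trim-to-shortest repair (a simplified stand-in, not the repository's exact code):

```python
import torch

def match_frames(*features):
    """Trim [channels, frames] tensors to the shortest frame count."""
    frames = min(feature.shape[-1] for feature in features)
    return [feature[..., :frames] for feature in features]

pitch = torch.rand(1, 201)
periodicity = torch.rand(1, 201)
loudness = torch.rand(1, 200)
pitch, periodicity, loudness = match_frames(pitch, periodicity, loudness)
print(pitch.shape, loudness.shape)  # torch.Size([1, 200]) torch.Size([1, 200])
```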
60 | for loudness_file, pitch_file in zip(loudness_files, pitch_files): 61 | loudness = torch.load(loudness_file) 62 | pitch = torch.load(f'{pitch_file}-pitch.pt') 63 | periodicity = torch.load(f'{pitch_file}-periodicity.pt') 64 | if pitch.shape[1] == loudness.shape[1] + 1: 65 | pitch = pitch[:, :-1] 66 | periodicity = periodicity[:, :-1] 67 | torch.save(pitch, f'{pitch_file}-pitch.pt') 68 | torch.save(periodicity, f'{pitch_file}-periodicity.pt') 69 | 70 | 71 | def from_audio(audio, gpu=None): 72 | """Preprocess one audio file""" 73 | # Move to device (no-op if devices are the same) 74 | audio = audio.to('cpu' if gpu is None else f'cuda:{gpu}') 75 | 76 | features = [] 77 | 78 | # Preprocess mels 79 | if emphases.MEL_FEATURE: 80 | features.append(emphases.data.preprocess.mels.from_audio(audio)) 81 | 82 | # Preprocess pitch and periodicity 83 | if emphases.PITCH_FEATURE or emphases.PERIODICITY_FEATURE: 84 | pitch, periodicity = penn.from_audio( 85 | audio, 86 | emphases.SAMPLE_RATE, 87 | hopsize=emphases.convert.samples_to_seconds(emphases.HOPSIZE), 88 | fmin=emphases.FMIN, 89 | fmax=emphases.FMAX, 90 | pad=True, 91 | interp_unvoiced_at=emphases.VOICED_THRESHOLD, 92 | gpu=gpu) 93 | 94 | if emphases.PITCH_FEATURE: 95 | if emphases.NORMALIZE: 96 | features.append( 97 | (torch.log2(pitch) - emphases.LOGFMIN) / 98 | (emphases.LOGFMAX - emphases.LOGFMIN)) 99 | else: 100 | features.append(torch.log2(pitch)) 101 | 102 | if emphases.PERIODICITY_FEATURE: 103 | features.append(periodicity) 104 | 105 | # Pitch and periodicity use floating-point hopsize, while mels and 106 | # loudness use an integer hopsize in samples. This results in 107 | # single-frame differences when the audio length is within one sample 108 | # of a new frame due to floating-point error. We simply remove the last 109 | # frame in this rare case. 
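The pitch branch above maps Hz onto a log-frequency scale and then to roughly [0, 1] using the configured log-frequency range. A standalone sketch with hypothetical FMIN/FMAX values (the real bounds live in the project configuration):

```python
import math
import torch

# Hypothetical pitch range in Hz; the real values come from the config
FMIN, FMAX = 50., 550.
LOGFMIN, LOGFMAX = math.log2(FMIN), math.log2(FMAX)

def normalize_pitch(pitch):
    """Map pitch in Hz to roughly [0, 1] on a log-frequency scale."""
    return (torch.log2(pitch) - LOGFMIN) / (LOGFMAX - LOGFMIN)

pitch = torch.tensor([[100., 200., 400.]])
print(normalize_pitch(pitch))  # ≈ tensor([[0.2891, 0.5781, 0.8672]])
```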
110 | frames = emphases.convert.samples_to_frames(audio.shape[-1]) 111 | if pitch.shape[1] == frames + 1: 112 | pitch = pitch[:, :-1] 113 | periodicity = periodicity[:, :-1] 114 | 115 | # Preprocess loudness 116 | if emphases.LOUDNESS_FEATURE: 117 | loudness = emphases.data.preprocess.loudness.from_audio( 118 | audio, 119 | emphases.SAMPLE_RATE) 120 | features.append(loudness.to(audio.device)) 121 | 122 | # Concatenate features 123 | features = features[0] if len(features) == 1 else torch.cat(features) 124 | 125 | return features[None] 126 | -------------------------------------------------------------------------------- /emphases/data/preprocess/loudness.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import warnings 3 | 4 | import librosa 5 | import numpy as np 6 | import penn 7 | import torch 8 | import torchutil 9 | 10 | import emphases 11 | 12 | 13 | ############################################################################### 14 | # Interface 15 | ############################################################################### 16 | 17 | 18 | def from_audio(audio, sample_rate=emphases.SAMPLE_RATE): 19 | """Compute mels from audio""" 20 | # Mayble resample 21 | audio = emphases.resample(audio, sample_rate) 22 | 23 | # Compute loudness 24 | return a_weighted(audio, sample_rate, hop_length=emphases.HOPSIZE) 25 | 26 | 27 | def from_file(audio_file): 28 | """Load audio and compute mels""" 29 | audio = emphases.load.audio(audio_file) 30 | 31 | # Compute loudness 32 | return from_audio(audio) 33 | 34 | 35 | def from_file_to_file(audio_file, output_file): 36 | """Compute loudness from audio file and save to disk""" 37 | loudness = from_file(audio_file) 38 | 39 | # Save to disk 40 | output_file.parent.mkdir(exist_ok=True, parents=True) 41 | torch.save(loudness, output_file) 42 | 43 | 44 | def from_files_to_files(audio_files, output_files): 45 | """Compute loudness for many files and save to disk""" 46 | torchutil.multiprocess_iterator( 47 | wrapper, 48 | zip(audio_files, output_files), 49 | 'Preprocessing a-weighted loudness', 50 | total=len(audio_files), 51 | num_workers=emphases.NUM_WORKERS) 52 | 53 | 54 | ############################################################################### 55 | # Loudness 56 | ############################################################################### 57 | 58 | 59 | def a_weighted(audio, sample_rate, hop_length=None, pad=False): 60 | """Retrieve the per-frame loudness""" 61 | # Save device 62 | device = audio.device 63 | 64 | # Default hop length of 10 ms 65 | hop_length = sample_rate // 100 if hop_length is None else hop_length 66 | 67 | if audio.dim() == 2: 68 | audio = audio[:, None, :] 69 | elif audio.dim() == 1: 70 | audio = audio[None, None, :] 71 | 72 | # Pad audio 73 | p = (emphases.NUM_FFT - emphases.HOPSIZE) // 2 74 | audio = torch.nn.functional.pad(audio, (p, p), "reflect").squeeze(1) 75 | 76 | # Convert to numpy 77 | audio = audio.detach().cpu().numpy().squeeze(0) 78 | 79 | # Cache weights 80 | if not hasattr(a_weighted, 'weights'): 81 | a_weighted.weights = perceptual_weights() 82 | 83 | # Take stft 84 | stft = librosa.stft( 85 | audio, 86 | n_fft=penn.WINDOW_SIZE, 87 | hop_length=hop_length, 88 | win_length=penn.WINDOW_SIZE, 89 | center=pad, 90 | pad_mode='constant') 91 | 92 | # Compute magnitude on db scale 93 | db = librosa.amplitude_to_db(np.abs(stft)) 94 | 95 | # Apply A-weighting 96 | weighted = db + a_weighted.weights 97 | 98 | # Threshold 99 | weighted[weighted < 
emphases.MIN_DB] = emphases.MIN_DB 100 | 101 | # Average over weighted frequencies 102 | loudness = torch.from_numpy(weighted.mean(axis=0)).float().to(device)[None] 103 | 104 | # Scale to roughly [0, 1] 105 | if emphases.NORMALIZE: 106 | return (loudness + 100.) / 100. 107 | return loudness 108 | 109 | 110 | def perceptual_weights(): 111 | """A-weighted frequency-dependent perceptual loudness weights""" 112 | frequencies = librosa.fft_frequencies( 113 | sr=penn.SAMPLE_RATE, 114 | n_fft=penn.WINDOW_SIZE) 115 | 116 | # A warning is raised for nearly inaudible frequencies, but it ends up 117 | # defaulting to -100 db. That default is fine for our purposes. 118 | with warnings.catch_warnings(): 119 | warnings.simplefilter('ignore', RuntimeWarning) 120 | return librosa.A_weighting(frequencies)[:, None] - emphases.REF_DB 121 | 122 | def wrapper(item): 123 | """Multiprocessing wrapper""" 124 | from_file_to_file(*item) 125 | -------------------------------------------------------------------------------- /emphases/data/preprocess/mels.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import os 3 | 4 | import librosa 5 | import torch 6 | import torchutil 7 | 8 | import emphases 9 | 10 | 11 | ############################################################################### 12 | # Mel spectrogram 13 | ############################################################################### 14 | 15 | 16 | def from_audio(audio): 17 | """Compute spectrogram from audio""" 18 | # Cache hann window 19 | if ( 20 | not hasattr(from_audio, 'window') or 21 | from_audio.dtype != audio.dtype or 22 | from_audio.device != audio.device 23 | ): 24 | from_audio.window = torch.hann_window( 25 | emphases.WINDOW_SIZE, 26 | dtype=audio.dtype, 27 | device=audio.device) 28 | from_audio.dtype = audio.dtype 29 | from_audio.device = audio.device 30 | 31 | # Pad audio 32 | size = (emphases.NUM_FFT - emphases.HOPSIZE) // 2 33 | audio = torch.nn.functional.pad( 34 | audio, 35 | (size, size), 36 | mode='reflect') 37 | 38 | # Compute stft 39 | stft = torch.stft( 40 | audio.squeeze(1), 41 | emphases.NUM_FFT, 42 | hop_length=emphases.HOPSIZE, 43 | window=from_audio.window, 44 | center=False, 45 | normalized=False, 46 | onesided=True, 47 | return_complex=True) 48 | stft = torch.view_as_real(stft)[0] 49 | 50 | # Compute magnitude 51 | spectrogram = torch.sqrt(stft.pow(2).sum(-1) + 1e-6) 52 | 53 | # Convert to mels 54 | mels = linear_to_mel(spectrogram) 55 | 56 | # Scale to roughly [0, 1] 57 | if emphases.NORMALIZE: 58 | return (mels + 10.) / 10. 
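Both feature extractors above end with a cheap affine rescale so that, when normalization is enabled, inputs land roughly in [0, 1]: loudness maps its dB floor (assumed here to be -100 dB) to 0, and log-mels map -10 to 0. A small numeric illustration of those two rescalings:

```python
import torch

# A-weighted loudness in dB, floored at an assumed MIN_DB of -100
loudness_db = torch.tensor([-100., -50., 0.])
print((loudness_db + 100.) / 100.)  # 0.0, 0.5, 1.0

# Log-mel energies, rescaled the same way with a different offset
log_mels = torch.tensor([-10., -5., 0.])
print((log_mels + 10.) / 10.)  # 0.0, 0.5, 1.0
```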
59 | return mels 60 | 61 | 62 | def from_file(audio_file): 63 | """Load audio and compute mels""" 64 | audio = emphases.load.audio(audio_file) 65 | 66 | # Compute mels 67 | return from_audio(audio) 68 | 69 | 70 | def from_file_to_file(audio_file, output_file): 71 | """Compute mels from audio file and save to disk""" 72 | mels = from_file(audio_file) 73 | 74 | # Save to disk 75 | output_file.parent.mkdir(exist_ok=True, parents=True) 76 | torch.save(mels, output_file) 77 | 78 | 79 | def from_files_to_files(audio_files, output_files): 80 | """Compute mels for many files and save to disk""" 81 | torchutil.multiprocess_iterator( 82 | wrapper, 83 | zip(audio_files, output_files), 84 | 'Preprocessing mels', 85 | total=len(audio_files), 86 | num_workers=emphases.NUM_WORKERS) 87 | 88 | 89 | ############################################################################### 90 | # Utilities 91 | ############################################################################### 92 | 93 | 94 | def linear_to_mel(spectrogram): 95 | # Create mel basis 96 | if not hasattr(linear_to_mel, 'mel_basis'): 97 | basis = librosa.filters.mel( 98 | sr=emphases.SAMPLE_RATE, 99 | n_fft=emphases.NUM_FFT, 100 | n_mels=emphases.NUM_MELS) 101 | basis = torch.from_numpy(basis) 102 | basis = basis.to(spectrogram.dtype).to(spectrogram.device) 103 | linear_to_mel.basis = basis 104 | 105 | # Convert to mels 106 | melspectrogram = torch.matmul(linear_to_mel.basis, spectrogram) 107 | 108 | # Apply dynamic range compression 109 | return torch.log(torch.clamp(melspectrogram, min=1e-5)) 110 | 111 | def wrapper(item): 112 | """Multiprocessing wrapper""" 113 | from_file_to_file(*item) 114 | -------------------------------------------------------------------------------- /emphases/data/sampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Sampler selection 8 | ############################################################################### 9 | 10 | 11 | def sampler(dataset, partition): 12 | """Create batch sampler""" 13 | # Deterministic random sampler for training 14 | if partition in ['train', 'valid']: 15 | return Sampler(dataset) 16 | 17 | # Sample validation and test data sequentially 18 | elif partition.startswith('test'): 19 | return torch.utils.data.BatchSampler( 20 | torch.utils.data.SequentialSampler(dataset), 21 | 1, 22 | False) 23 | 24 | else: 25 | raise ValueError(f'Partition {partition} is not defined') 26 | 27 | 28 | ############################################################################### 29 | # Samplers 30 | ############################################################################### 31 | 32 | 33 | class Sampler: 34 | 35 | def __init__(self, dataset, max_frames=emphases.MAX_TRAINING_FRAMES): 36 | self.max_frames = max_frames 37 | self.epoch = 0 38 | self.length = len(dataset) 39 | self.buckets = dataset.buckets() 40 | 41 | def __iter__(self): 42 | return iter(self.batch()) 43 | 44 | def __len__(self): 45 | return len(self.batch()) 46 | 47 | def batch(self): 48 | """Produces batch indices for one epoch""" 49 | # Deterministic shuffling based on epoch 50 | generator = torch.Generator() 51 | generator.manual_seed(emphases.RANDOM_SEED + self.epoch) 52 | 53 | # Iterate over length-partitioned buckets 54 | batches = [] 55 | for bucket in self.buckets: 56 | 57 | # Shuffle bucket 58 | bucket = bucket[ 59 | torch.randperm(len(bucket), 
generator=generator).tolist()] 60 | 61 | # Variable batch size 62 | batch = [] 63 | max_length = 0 64 | for index, length in bucket: 65 | max_length = max(max_length, length) 66 | if ( 67 | batch and 68 | (len(batch) + 1) * max_length > self.max_frames 69 | ): 70 | batches.append(batch) 71 | max_length = length 72 | batch = [index] 73 | else: 74 | batch.append(index) 75 | 76 | # Don't drop last batch 77 | if batch: 78 | batches.append(batch) 79 | 80 | # Shuffle 81 | return [ 82 | batches[i] for i in 83 | torch.randperm(len(batches), generator=generator).tolist()] 84 | 85 | def set_epoch(self, epoch): 86 | self.epoch = epoch 87 | -------------------------------------------------------------------------------- /emphases/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from . import metrics 3 | from .metrics import Metrics 4 | -------------------------------------------------------------------------------- /emphases/evaluate/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Entry point 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | '--datasets', 17 | nargs='+', 18 | default=emphases.EVALUATION_DATASETS, 19 | help='The datasets to evaluate') 20 | parser.add_argument( 21 | '--checkpoint', 22 | type=Path, 23 | help='The checkpoint file to evaluate') 24 | parser.add_argument( 25 | '--gpu', 26 | type=int, 27 | help='The index of the GPU to use for evaluation') 28 | 29 | return parser.parse_known_args()[0] 30 | 31 | 32 | if __name__ == '__main__': 33 | emphases.evaluate.datasets(**vars(parse_args())) 34 | -------------------------------------------------------------------------------- /emphases/evaluate/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torchutil 5 | 6 | import emphases 7 | 8 | 9 | ############################################################################### 10 | # Evaluate 11 | ############################################################################### 12 | 13 | 14 | @torchutil.notify('evaluate') 15 | def datasets(datasets, checkpoint=None, gpu=None): 16 | """Perform evaluation""" 17 | device = torch.device('cpu' if gpu is None else f'cuda:{gpu}') 18 | 19 | # Containers for results 20 | overall, granular = {}, {} 21 | 22 | # Evaluate each dataset 23 | for dataset in datasets: 24 | 25 | # Get data loader 26 | loader = emphases.data.loader(dataset, 'test', gpu) 27 | 28 | # Get mean and variance for Pearson Correlation 29 | target_stats = emphases.evaluate.metrics.Statistics() 30 | predicted_stats = emphases.evaluate.metrics.Statistics() 31 | for batch in loader: 32 | 33 | # Unpack 34 | _, _, _, word_lengths, targets, alignments, audio, _ = batch 35 | 36 | # Get predicted scores 37 | scores = emphases.from_alignment_and_audio( 38 | alignments[0], 39 | audio[0], 40 | emphases.SAMPLE_RATE, 41 | checkpoint=checkpoint, 42 | gpu=gpu) 43 | 44 | # Update statistics 45 | target_stats.update(targets, word_lengths) 46 | predicted_stats.update(scores[None], word_lengths) 47 | 48 | # Get metric class 49 | metric_fn = 
emphases.evaluate.Metrics 50 | 51 | # Per-file metrics 52 | file_metrics = metric_fn(predicted_stats, target_stats) 53 | 54 | # Per-dataset metrics 55 | dataset_metrics = metric_fn(predicted_stats, target_stats) 56 | 57 | # Iterate over test set 58 | for batch in torchutil.iterator( 59 | loader, 60 | f'Evaluating {emphases.CONFIG} on {dataset}', 61 | total=len(loader) 62 | ): 63 | 64 | # Unpack 65 | ( 66 | _, 67 | frame_lengths, 68 | word_bounds, 69 | word_lengths, 70 | targets, 71 | alignments, 72 | audio, 73 | stems 74 | ) = batch 75 | 76 | # Reset file metrics 77 | file_metrics.reset() 78 | 79 | if emphases.METHOD == 'neural': 80 | 81 | # Get predicted scores 82 | scores = [] 83 | 84 | # Preprocess audio 85 | for features, word_bounds in emphases.preprocess( 86 | alignments[0], 87 | audio[0], 88 | gpu=gpu 89 | ): 90 | 91 | # Infer 92 | logits = emphases.infer( 93 | features, 94 | word_bounds, 95 | checkpoint).detach() 96 | 97 | # Skip postprocessing 98 | scores.append(logits) 99 | 100 | # Concatenate results 101 | scores = torch.cat(scores, 2) 102 | 103 | else: 104 | 105 | # Baseline method inference 106 | scores = emphases.from_alignment_and_audio( 107 | alignments[0], 108 | audio[0], 109 | emphases.SAMPLE_RATE, 110 | gpu=gpu)[None] 111 | 112 | # Update metrics 113 | args = (scores, targets.to(device), word_lengths.to(device)) 114 | file_metrics.update(*args) 115 | dataset_metrics.update(*args) 116 | 117 | # Copy results 118 | granular[f'{dataset}/{stems[0]}'] = file_metrics() 119 | overall[dataset] = dataset_metrics() 120 | 121 | # Write to json files 122 | directory = emphases.EVAL_DIR / emphases.CONFIG 123 | directory.mkdir(exist_ok=True, parents=True) 124 | with open(directory / 'overall.json', 'w') as file: 125 | json.dump(overall, file, indent=4) 126 | with open(directory / 'granular.json', 'w') as file: 127 | json.dump(granular, file, indent=4) 128 | -------------------------------------------------------------------------------- /emphases/evaluate/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchutil 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Aggregate metric 9 | ############################################################################### 10 | 11 | 12 | class Metrics: 13 | 14 | def __init__(self, predicted_stats, target_stats): 15 | self.correlation = torchutil.metrics.PearsonCorrelation( 16 | *predicted_stats(), 17 | *target_stats()) 18 | self.bce = BinaryCrossEntropy() 19 | self.mse = MeanSquaredError() 20 | 21 | def __call__(self): 22 | return { 23 | 'pearson_correlation': self.correlation(), 24 | 'bce': self.bce(), 25 | 'mse': self.mse()} 26 | 27 | def update( 28 | self, 29 | logits, 30 | targets, 31 | word_lengths): 32 | # Detach from graph 33 | logits = logits.detach() 34 | 35 | # Word resolution sequence mask 36 | mask = emphases.model.mask_from_lengths(word_lengths) 37 | logits, targets = logits[mask], targets[mask] 38 | 39 | # Update cross entropy 40 | self.bce.update(logits, targets) 41 | 42 | # Update squared error 43 | self.mse.update(emphases.postprocess(logits), targets) 44 | 45 | # Update pearson correlation 46 | self.correlation.update(emphases.postprocess(logits), targets) 47 | 48 | def reset(self): 49 | self.correlation.reset() 50 | self.bce.reset() 51 | self.mse.reset() 52 | 53 | 54 | ############################################################################### 55 | # Individual metrics 56 | 
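The aggregate metric above centers on the Pearson correlation between predicted and annotated emphasis scores, computed in a streaming fashion from the precomputed mean and standard deviation statistics via torchutil. For intuition, a self-contained, non-streaming version of the same quantity:

```python
import torch

def pearson(x, y):
    """Pearson correlation between two 1-D tensors."""
    x = x - x.mean()
    y = y - y.mean()
    return (x * y).sum() / ((x * x).sum().sqrt() * (y * y).sum().sqrt() + 1e-8)

predicted = torch.tensor([0.1, 0.7, 0.3, 0.9])
target = torch.tensor([0.0, 1.0, 0.5, 1.0])
print(pearson(predicted, target))  # ≈ 0.95
```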
############################################################################### 57 | 58 | 59 | class BinaryCrossEntropy(torchutil.metrics.Average): 60 | 61 | def update(self, scores, targets): 62 | if emphases.LOSS == 'bce': 63 | 64 | # Get values from logits 65 | values = torch.nn.functional.binary_cross_entropy_with_logits( 66 | scores, 67 | targets, 68 | reduction='none') 69 | 70 | else: 71 | 72 | # Get values from probabilities 73 | x, y = torch.clamp(scores, 0., 1.), targets 74 | values = -( 75 | y * torch.log(x + 1e-6) + (1 - y) * torch.log(1 - x + 1e-6)) 76 | 77 | # Update 78 | super().update(values, values.numel()) 79 | 80 | 81 | # TODO - fix scaling 82 | class MeanSquaredError(torchutil.metrics.Average): 83 | 84 | def update( 85 | self, 86 | scores, 87 | targets): 88 | # Compute sum of MSE 89 | values = torch.nn.functional.mse_loss( 90 | scores, 91 | targets, 92 | reduction='none') 93 | 94 | # Update 95 | super().update(values, values.numel()) 96 | 97 | 98 | ############################################################################### 99 | # Utilities 100 | ############################################################################### 101 | 102 | 103 | class Statistics(torchutil.metrics.MeanStd): 104 | 105 | def update(self, values, lengths): 106 | # Sequence mask 107 | mask = emphases.model.mask_from_lengths(lengths) 108 | 109 | # Update 110 | super().update(values[mask].flatten().tolist()) 111 | -------------------------------------------------------------------------------- /emphases/load.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Loading utilities 8 | ############################################################################### 9 | 10 | 11 | def audio(file): 12 | """Load audio and maybe resample""" 13 | # Load 14 | audio, sample_rate = torchaudio.load(file) 15 | 16 | # Maybe resample 17 | return emphases.resample(audio, sample_rate) 18 | -------------------------------------------------------------------------------- /emphases/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from .layers import Layers 3 | 4 | import emphases 5 | -------------------------------------------------------------------------------- /emphases/model/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Model definition 8 | ############################################################################### 9 | 10 | 11 | class Model(torch.nn.Module): 12 | 13 | def __init__(self): 14 | super().__init__() 15 | 16 | # Input projection 17 | self.input_layer = torch.nn.Conv1d( 18 | emphases.NUM_FEATURES, 19 | emphases.CHANNELS, 20 | kernel_size=emphases.ENCODER_KERNEL_SIZE, 21 | padding='same') 22 | 23 | # Frame encoder 24 | self.frame_encoder = emphases.model.Layers( 25 | kernel_size=emphases.ENCODER_KERNEL_SIZE) 26 | 27 | # If we are resampling within the model, initialize word decoder 28 | if emphases.DOWNSAMPLE_LOCATION in ['input', 'intermediate']: 29 | self.word_decoder = emphases.model.Layers( 30 | kernel_size=emphases.DECODER_KERNEL_SIZE) 31 | 32 | # Output projection 33 | self.output_layer = torch.nn.Conv1d( 34 | emphases.CHANNELS, 35 | 1, 36 | 
kernel_size=emphases.DECODER_KERNEL_SIZE, 37 | padding='same') 38 | 39 | def forward(self, features, frame_lengths, word_bounds, word_lengths): 40 | 41 | if emphases.DOWNSAMPLE_LOCATION == 'input': 42 | 43 | # Segment acoustic features into word segments 44 | segments, bounds, lengths = emphases.segment( 45 | features, 46 | word_bounds, 47 | word_lengths) 48 | 49 | # Embed frames 50 | frame_embeddings = self.frame_encoder( 51 | self.input_layer(segments), 52 | lengths) 53 | 54 | # Downsample 55 | if emphases.DOWNSAMPLE_METHOD == 'average': 56 | word_embeddings = frame_embeddings.mean(dim=2, keepdim=True) 57 | elif emphases.DOWNSAMPLE_METHOD == 'max': 58 | word_embeddings = frame_embeddings.max( 59 | dim=2, 60 | keepdim=True 61 | ).values 62 | elif emphases.DOWNSAMPLE_METHOD == 'sum': 63 | word_embeddings = frame_embeddings.sum(dim=2, keepdim=True) 64 | elif emphases.DOWNSAMPLE_METHOD == 'center': 65 | word_embeddings = emphases.downsample( 66 | frame_embeddings, 67 | bounds, 68 | torch.ones( 69 | (len(lengths),), 70 | dtype=torch.long, 71 | device=lengths.device)) 72 | else: 73 | raise ValueError( 74 | f'Interpolation method {emphases.DOWNSAMPLE_METHOD} is not defined') 75 | 76 | # Stitch together word segment embeddings 77 | mask = mask_from_lengths(word_lengths) 78 | word_embeddings = word_embeddings.squeeze(2).transpose(0, 1).reshape( 79 | word_embeddings.shape[1], 80 | word_bounds.shape[0], 81 | word_bounds.shape[2] 82 | ).permute(1, 0, 2) * mask 83 | 84 | # Decode 85 | word_embeddings = self.word_decoder( 86 | word_embeddings, 87 | word_lengths) 88 | 89 | else: 90 | 91 | # Embed frames 92 | frame_embeddings = self.frame_encoder( 93 | self.input_layer(features), 94 | frame_lengths) 95 | 96 | if emphases.DOWNSAMPLE_LOCATION == 'intermediate': 97 | 98 | # Downsample activations to word resolution 99 | word_embeddings = emphases.downsample( 100 | frame_embeddings, 101 | word_bounds, 102 | word_lengths) 103 | 104 | # Infer emphasis scores from word embeddings 105 | word_embeddings = self.word_decoder( 106 | word_embeddings, 107 | word_lengths) 108 | 109 | elif emphases.DOWNSAMPLE_LOCATION == 'loss': 110 | 111 | # Downsample activations to word resolution 112 | word_embeddings = emphases.downsample( 113 | frame_embeddings, 114 | word_bounds, 115 | word_lengths) 116 | 117 | elif emphases.DOWNSAMPLE_LOCATION == 'inference': 118 | 119 | if self.training: 120 | 121 | # Return frame resolution prominence for framewise loss 122 | return self.output_layer(frame_embeddings) 123 | 124 | else: 125 | 126 | # Downsample activations to word resolution 127 | word_embeddings = emphases.downsample( 128 | frame_embeddings, 129 | word_bounds, 130 | word_lengths) 131 | 132 | else: 133 | raise ValueError( 134 | f'Downsample location {emphases.DOWNSAMPLE_LOCATION} ' + 135 | 'not recognized') 136 | 137 | # Project to scalar 138 | return self.output_layer(word_embeddings) 139 | 140 | 141 | ############################################################################### 142 | # Utilities 143 | ############################################################################### 144 | 145 | 146 | def mask_from_lengths(lengths): 147 | """Create boolean mask from sequence lengths""" 148 | x = torch.arange(lengths.max(), dtype=lengths.dtype, device=lengths.device) 149 | return (x.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(1) 150 | -------------------------------------------------------------------------------- /emphases/model/layers/__init__.py: 
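`mask_from_lengths` above builds the boolean padding mask shared by the convolutional and transformer stacks. A quick usage sketch; the helper is copied here so the snippet runs on its own:

```python
import torch

def mask_from_lengths(lengths):
    """Boolean mask of shape [batch, 1, max_length], mirroring the helper above."""
    x = torch.arange(lengths.max(), dtype=lengths.dtype, device=lengths.device)
    return (x.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(1)

lengths = torch.tensor([3, 5])
mask = mask_from_lengths(lengths)
print(mask.shape)         # torch.Size([2, 1, 5])
print(mask[0].squeeze())  # True, True, True, False, False
```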
-------------------------------------------------------------------------------- 1 | from .convolution import Convolution 2 | from .transformer import Transformer 3 | 4 | import emphases 5 | 6 | 7 | def Layers(**kwargs): 8 | if emphases.ARCHITECTURE == 'convolution': 9 | return Convolution(**kwargs) 10 | elif emphases.ARCHITECTURE == 'transformer': 11 | return Transformer() 12 | else: 13 | raise ValueError( 14 | f'Network layer {emphases.ARCHITECTURE} is not defined') 15 | -------------------------------------------------------------------------------- /emphases/model/layers/convolution.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import torch 4 | 5 | import emphases 6 | 7 | 8 | ############################################################################### 9 | # Convolution model 10 | ############################################################################### 11 | 12 | 13 | class Convolution(torch.nn.Sequential): 14 | 15 | def __init__(self, kernel_size=emphases.ENCODER_KERNEL_SIZE): 16 | # Bind common parameters 17 | conv_fn = functools.partial( 18 | torch.nn.Conv1d, 19 | kernel_size=kernel_size, 20 | padding='same') 21 | 22 | # Layers 23 | layers = [] 24 | channels = emphases.CHANNELS 25 | for _ in range(emphases.LAYERS): 26 | layers.extend(( 27 | conv_fn(channels, channels), 28 | emphases.ACTIVATION_FUNCTION())) 29 | if emphases.DROPOUT is not None: 30 | layers.append(torch.nn.Dropout(emphases.DROPOUT)) 31 | 32 | # Register to Module 33 | super().__init__(*layers) 34 | 35 | # Ignore sequence length parameter needed for Transformer model 36 | def forward(self, x, _): 37 | return super().forward(x) 38 | -------------------------------------------------------------------------------- /emphases/model/layers/transformer.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | import emphases 6 | 7 | 8 | ############################################################################### 9 | # Transformer stack 10 | ############################################################################### 11 | 12 | 13 | class Transformer(torch.nn.Module): 14 | 15 | def __init__(self, num_layers=emphases.LAYERS, channels=emphases.CHANNELS): 16 | super().__init__() 17 | self.position = PositionalEncoding(channels, .1) 18 | self.model = torch.nn.TransformerEncoder( 19 | torch.nn.TransformerEncoderLayer( 20 | channels, 21 | 2, 22 | dim_feedforward=emphases.CHANNELS), 23 | num_layers) 24 | 25 | def forward(self, x, lengths): 26 | mask = emphases.model.mask_from_lengths(lengths) 27 | return self.model( 28 | self.position(x.permute(2, 0, 1)), 29 | src_key_padding_mask=~mask.squeeze(1) 30 | ).permute(1, 2, 0) 31 | 32 | 33 | ############################################################################### 34 | # Utilities 35 | ############################################################################### 36 | 37 | 38 | class PositionalEncoding(torch.nn.Module): 39 | 40 | def __init__(self, channels, dropout=.1, max_len=5000): 41 | super().__init__() 42 | self.dropout = torch.nn.Dropout(p=dropout) 43 | index = torch.arange(max_len).unsqueeze(1) 44 | frequency = torch.exp( 45 | torch.arange(0, channels, 2) * (-math.log(10000.0) / channels)) 46 | encoding = torch.zeros(max_len, 1, channels) 47 | encoding[:, 0, 0::2] = torch.sin(index * frequency) 48 | encoding[:, 0, 1::2] = torch.cos(index * frequency) 49 | self.register_buffer('encoding', encoding) 50 | 51 | def forward(self, 
x): 52 | return self.dropout(x + self.encoding[:x.size(0)]) 53 | -------------------------------------------------------------------------------- /emphases/partition/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/partition/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import emphases 4 | 5 | 6 | def parse_args(): 7 | """Parse command-line arguments""" 8 | parser = argparse.ArgumentParser(description='Partition datasets') 9 | parser.add_argument( 10 | '--datasets', 11 | nargs='+', 12 | default=emphases.DATASETS, 13 | help='The datasets to partition') 14 | return parser.parse_known_args()[0] 15 | 16 | 17 | if __name__ == '__main__': 18 | emphases.partition.datasets(**vars(parse_args())) 19 | -------------------------------------------------------------------------------- /emphases/partition/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Partition dataset 8 | ############################################################################### 9 | 10 | 11 | def datasets(datasets): 12 | """Partition datasets""" 13 | for dataset in datasets: 14 | 15 | # Check if partition already exists 16 | file = emphases.PARTITION_DIR / f'{dataset}.json' 17 | 18 | # Random seed 19 | random.seed(emphases.RANDOM_SEED) 20 | 21 | # Make partition 22 | if dataset == 'automatic': 23 | partition = automatic() 24 | elif dataset == 'buckeye': 25 | partition = buckeye() 26 | elif dataset == 'libritts': 27 | partition = libritts() 28 | elif dataset == 'crowdsource': 29 | partition = crowdsource() 30 | else: 31 | raise ValueError(f'Dataset {dataset} is not defined') 32 | 33 | # Save to disk 34 | file.parent.mkdir(exist_ok=True, parents=True) 35 | with open(file, 'w') as file: 36 | json.dump(partition, file, ensure_ascii=False, indent=4) 37 | 38 | 39 | ############################################################################### 40 | # Existing datasets 41 | ############################################################################### 42 | 43 | 44 | def buckeye(): 45 | """Partition buckeye dataset""" 46 | # Get audio files 47 | directory = emphases.CACHE_DIR / 'buckeye' 48 | audio_files = directory.rglob('*.wav') 49 | 50 | # Get stems 51 | stems = [file.stem for file in audio_files] 52 | 53 | # Partition 54 | return {'train': [], 'valid': [], 'test': stems} 55 | 56 | 57 | def libritts(): 58 | """Partition libritts dataset""" 59 | # Get audio files 60 | directory = emphases.CACHE_DIR / 'libritts' 61 | audio_files = directory.rglob('*.wav') 62 | 63 | # Get stems 64 | stems = [file.stem for file in audio_files] 65 | 66 | # Shuffle stems 67 | random.seed(emphases.RANDOM_SEED) 68 | random.shuffle(stems) 69 | 70 | # Get split locations 71 | left = int(emphases.SPLIT_SIZE_TRAIN * len(stems)) 72 | right = left + int(emphases.SPLIT_SIZE_VALID * len(stems)) 73 | 74 | # Only train on specified eighth for scaling law experiments 75 | if emphases.ONE_EIGHTH_UTTERANCES: 76 | 77 | # Partition 78 | speakers = [str(s) for s in emphases.data.download.LIBRITTS_SPEAKERS] 79 | train = [stem for stem in stems if stem.split('_')[0] in speakers] 80 | valid = [stem for stem in stems[left:right] if stem not in train] 81 
| test = [stem for stem in stems[right:] if stem not in train] 82 | 83 | else: 84 | 85 | # Partition 86 | train = stems[:left] 87 | valid = stems[left:right] 88 | test = stems[right:] 89 | 90 | # Maybe limit training set size 91 | if emphases.MAX_TRAINING_UTTERANCES is not None: 92 | train = train[:emphases.MAX_TRAINING_UTTERANCES] 93 | 94 | return {'train': train, 'valid': valid, 'test': test} 95 | 96 | 97 | ############################################################################### 98 | # Dataset creation 99 | ############################################################################### 100 | 101 | 102 | def automatic(): 103 | """Partition dataset created from trained model""" 104 | # Get audio files 105 | directory = emphases.CACHE_DIR / 'automatic' 106 | audio_files = directory.rglob('*.wav') 107 | 108 | # Get stems 109 | stems = [file.stem for file in audio_files] 110 | 111 | # Shuffle stems 112 | random.seed(emphases.RANDOM_SEED) 113 | random.shuffle(stems) 114 | 115 | # Get split locations 116 | left = int(emphases.SPLIT_SIZE_TRAIN * len(stems)) 117 | right = left + int(emphases.SPLIT_SIZE_VALID * len(stems)) 118 | 119 | # Partition 120 | return { 121 | 'train': stems[:left], 122 | 'valid': stems[left:right], 123 | 'test': stems[right:]} 124 | 125 | 126 | def crowdsource(): 127 | """Partition crowdsourced dataset""" 128 | # Get audio files 129 | directory = emphases.CACHE_DIR / 'crowdsource' 130 | audio_files = directory.rglob('*.wav') 131 | 132 | # Get stems 133 | stems = [file.stem for file in audio_files] 134 | 135 | # Shuffle stems 136 | random.seed(emphases.RANDOM_SEED) 137 | random.shuffle(stems) 138 | 139 | # Get split locations 140 | left = int(emphases.SPLIT_SIZE_TRAIN * len(stems)) 141 | right = left + int(emphases.SPLIT_SIZE_VALID * len(stems)) 142 | 143 | # Only train on specified eighth for scaling law experiments 144 | if emphases.ONE_EIGHTH_UTTERANCES: 145 | 146 | # Partition 147 | speakers = [str(s) for s in emphases.data.download.LIBRITTS_SPEAKERS] 148 | train = [stem for stem in stems if stem.split('_')[0] in speakers] 149 | valid = [stem for stem in stems[left:right] if stem not in train] 150 | test = [stem for stem in stems[right:] if stem not in train] 151 | 152 | else: 153 | 154 | # Partition 155 | train = stems[:left] 156 | valid = stems[left:right] 157 | test = stems[right:] 158 | 159 | # Maybe limit training set size 160 | if emphases.MAX_TRAINING_UTTERANCES is not None: 161 | train = train[:emphases.MAX_TRAINING_UTTERANCES] 162 | 163 | return {'train': train, 'valid': valid, 'test': test} 164 | -------------------------------------------------------------------------------- /emphases/plot/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from . 
import scaling 3 | -------------------------------------------------------------------------------- /emphases/plot/core.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | import torch 4 | 5 | 6 | ############################################################################### 7 | # Plot prominence 8 | ############################################################################### 9 | 10 | 11 | def scores(alignment, scores, targets=None): 12 | """Plot the aligned prominence scores""" 13 | figure, axis = plt.subplots(figsize=(30, 5)) 14 | axis.set_axis_off() 15 | axis.set_ylim([0., 1.]) 16 | 17 | # Get words, start times, and durations 18 | centers = [word.start() + word.duration() / 2. for word in alignment] 19 | duration = [word.duration() for word in alignment] 20 | 21 | # Plot scores 22 | axis.bar( 23 | centers, 24 | scores, 25 | duration, 26 | edgecolor='black') 27 | 28 | # Plot words and dividers 29 | for word in alignment: 30 | axis.text( 31 | word.start() + word.duration() / 2, 32 | .015, 33 | str(word), 34 | fontsize=10, 35 | rotation=90, 36 | horizontalalignment='center') 37 | axis.axvline( 38 | word.start(), 39 | color='gray', 40 | linewidth=.5, 41 | ymin=0., 42 | ymax=1., 43 | clip_on=False, 44 | linestyle='--') 45 | axis.axvline( 46 | alignment.duration(), 47 | color='gray', 48 | linewidth=.5, 49 | ymin=0., 50 | ymax=1., 51 | clip_on=False, 52 | linestyle='--') 53 | 54 | if targets is not None: 55 | 56 | # Plot targets 57 | axis.bar(centers, targets, duration) 58 | 59 | # Plot overlap 60 | overlap = torch.minimum(scores, targets) 61 | axis.bar(centers, overlap, duration, color='gray') 62 | 63 | return figure 64 | -------------------------------------------------------------------------------- /emphases/plot/scaling/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * -------------------------------------------------------------------------------- /emphases/plot/scaling/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Scaling laws plot 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser( 15 | description='Create scaling law figure') 16 | parser.add_argument( 17 | '--evaluations', 18 | type=str, 19 | nargs='+', 20 | required=True, 21 | help='The evaluations to plot') 22 | parser.add_argument( 23 | '--xlabel', 24 | type=str, 25 | required=True, 26 | help='Label for x axis') 27 | parser.add_argument( 28 | '--output_file', 29 | type=Path, 30 | required=True, 31 | help='The output jpg file') 32 | parser.add_argument( 33 | '--yticks', 34 | type=float, 35 | nargs='+', 36 | required=True, 37 | help='The y axis tick mark locations') 38 | parser.add_argument( 39 | '--sizes', 40 | type=int, 41 | nargs='+', 42 | help='The number of utterances used in each evaluation') 43 | parser.add_argument( 44 | '--scores', 45 | type=float, 46 | nargs='+', 47 | help='The Pearson Correlation y values') 48 | parser.add_argument( 49 | '--steps', 50 | type=int, 51 | nargs='+', 52 | help='The number of training steps') 53 | parser.add_argument( 54 | '--text_offsets', 55 | type=float, 56 | nargs='+', 57 | help='The 
amount to space the text below the plot point') 58 | return parser.parse_args() 59 | 60 | 61 | if __name__ == '__main__': 62 | emphases.plot.scaling.scaling_laws(**vars(parse_args())) 63 | -------------------------------------------------------------------------------- /emphases/plot/scaling/core.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import torch 4 | 5 | import emphases 6 | 7 | 8 | ############################################################################### 9 | # Plot scaling laws 10 | ############################################################################### 11 | 12 | 13 | def scaling_laws( 14 | evaluations, 15 | xlabel, 16 | output_file, 17 | yticks, 18 | scores=None, 19 | steps=None, 20 | sizes=None, 21 | text_offsets=None): 22 | """Plot scaling laws""" 23 | # Load evaluation results 24 | if scores is None or steps is None: 25 | scores, steps = [], [] 26 | for evaluation in evaluations: 27 | path, score = emphases.checkpoint.best_path( 28 | emphases.RUNS_DIR / evaluation) 29 | checkpoint = torch.load(path, map_location='cpu') 30 | scores.append(score) 31 | steps.append(checkpoint['step']) 32 | 33 | # Get x values 34 | x = [int(eval.split('-')[-1]) for eval in evaluations] 35 | 36 | # Create plot 37 | figure, axis = plt.subplots(figsize=(8, 2)) 38 | 39 | # Remove frame 40 | axis.spines['top'].set_visible(False) 41 | axis.spines['right'].set_visible(False) 42 | axis.spines['bottom'].set_visible(False) 43 | axis.spines['left'].set_visible(False) 44 | 45 | # Format x axis 46 | x_range = max(x) - min(x) 47 | axis.set_xlim([0, max(x) + 0.1 * x_range]) 48 | axis.get_xaxis().set_ticks(x) 49 | axis.set_xlabel(xlabel) 50 | axis.xaxis.set_ticks(x) 51 | axis.xaxis.set_ticklabels(x) 52 | 53 | # Format y axis 54 | axis.get_yaxis().set_ticks(yticks) 55 | axis.set_ylim([min(yticks) - .002, max(yticks) + .002]) 56 | axis.tick_params(axis=u'both', which=u'both',length=0) 57 | axis.set_ylabel('Pearson correlation') 58 | 59 | # Grid lines 60 | for tick in yticks: 61 | axis.axhline(tick, color='gray', linestyle='--', linewidth=.8) 62 | 63 | # Plot 64 | colors = ['blue', 'orange', 'purple', 'red'] 65 | for i in range(len(x)): 66 | axis.scatter(x[i], scores[i], color=colors[i]) 67 | 68 | # Default text offset 69 | if text_offsets is None: 70 | text_offsets = [0.011] * len(evaluations) 71 | 72 | # Annotate 73 | for i in range(len(evaluations)): 74 | text = f'steps={steps[i]}' 75 | if sizes is not None: 76 | text += f'\nutterances={sizes[i]}' 77 | axis.text( 78 | x[i], 79 | scores[i] - text_offsets[i], 80 | text, 81 | horizontalalignment='center') 82 | 83 | # Save 84 | figure.savefig(output_file, bbox_inches='tight', pad_inches=0, dpi=300) 85 | -------------------------------------------------------------------------------- /emphases/train/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/train/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import shutil 3 | from pathlib import Path 4 | 5 | import torchutil 6 | 7 | import emphases 8 | 9 | 10 | ############################################################################### 11 | # Entry point 12 | ############################################################################### 13 | 14 | 15 | def main(config, dataset, 
gpu=None): 16 | # Create output directory 17 | directory = emphases.RUNS_DIR / config.stem 18 | directory.mkdir(parents=True, exist_ok=True) 19 | 20 | # Save configuration 21 | shutil.copyfile(config, directory / config.name) 22 | 23 | # Train 24 | emphases.train(dataset, directory, gpu) 25 | 26 | # Get best checkpoint 27 | checkpoint = torchutil.checkpoint.best_path(directory)[0] 28 | 29 | # Evaluate 30 | emphases.evaluate.datasets(emphases.EVALUATION_DATASETS, checkpoint, gpu) 31 | 32 | 33 | def parse_args(): 34 | """Parse command-line arguments""" 35 | parser = argparse.ArgumentParser(description='Train a model') 36 | parser.add_argument( 37 | '--config', 38 | type=Path, 39 | help='The configuration file') 40 | parser.add_argument( 41 | '--dataset', 42 | default=emphases.TRAINING_DATASET, 43 | help='The dataset to train on') 44 | parser.add_argument( 45 | '--gpu', 46 | type=int, 47 | help='The gpu to run training on') 48 | return parser.parse_args() 49 | 50 | 51 | if __name__ == '__main__': 52 | main(**vars(parse_args())) 53 | -------------------------------------------------------------------------------- /emphases/train/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchutil 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Training 9 | ############################################################################### 10 | 11 | 12 | @torchutil.notify('train') 13 | def train(dataset, directory, gpu=None): 14 | """Train a model""" 15 | 16 | # Get torch device 17 | device = torch.device('cpu' if gpu is None else f'cuda:{gpu}') 18 | 19 | ####################### 20 | # Create data loaders # 21 | ####################### 22 | 23 | torch.manual_seed(emphases.RANDOM_SEED) 24 | 25 | # Training data 26 | train_loader = emphases.data.loader(dataset, 'train', gpu) 27 | 28 | # Validation data 29 | if emphases.VALIDATION_DATASET == 'buckeye': 30 | 31 | # This is just for generating scaling law plots for the paper 32 | valid_loader = emphases.data.loader('buckeye', 'test', gpu) 33 | 34 | else: 35 | 36 | valid_loader = emphases.data.loader(dataset, 'valid', gpu) 37 | 38 | ################ 39 | # Create model # 40 | ################ 41 | 42 | model = emphases.Model().to(device) 43 | 44 | #################### 45 | # Create optimizer # 46 | #################### 47 | 48 | optimizer = torch.optim.Adam(model.parameters()) 49 | 50 | ############################## 51 | # Maybe load from checkpoint # 52 | ############################## 53 | 54 | path = torchutil.checkpoint.latest_path(directory) 55 | 56 | if path is not None: 57 | 58 | # Load model 59 | model, optimizer, state = torchutil.checkpoint.load( 60 | path, 61 | model, 62 | optimizer) 63 | epoch = state['epoch'] 64 | step = state['step'] 65 | score = state['score'] 66 | best = state['best'] 67 | 68 | else: 69 | 70 | # Train from scratch 71 | epoch, step, score, best = 0, 0, 0., 0. 
72 | 73 | ######### 74 | # Train # 75 | ######### 76 | 77 | # Automatic mixed precision (amp) gradient scaler 78 | scaler = torch.cuda.amp.GradScaler() 79 | 80 | # Setup progress bar 81 | progress = torchutil.iterator( 82 | range(step, emphases.NUM_STEPS), 83 | f'Training {emphases.CONFIG}', 84 | step, 85 | emphases.NUM_STEPS) 86 | while step < emphases.NUM_STEPS: 87 | 88 | # Seed sampler 89 | train_loader.batch_sampler.set_epoch(epoch) 90 | 91 | for batch in train_loader: 92 | 93 | # Unpack batch 94 | ( 95 | features, 96 | frame_lengths, 97 | word_bounds, 98 | word_lengths, 99 | targets, 100 | _, # alignment 101 | _, # audio 102 | _ # stem 103 | ) = batch 104 | 105 | # Copy to GPU 106 | features = features.to(device) 107 | frame_lengths = frame_lengths.to(device) 108 | word_bounds = word_bounds.to(device) 109 | word_lengths = word_lengths.to(device) 110 | targets = targets.to(device) 111 | with torch.autocast(device.type): 112 | 113 | # Forward pass 114 | scores = model( 115 | features, 116 | frame_lengths, 117 | word_bounds, 118 | word_lengths) 119 | 120 | # Compute loss 121 | train_loss = loss( 122 | scores, 123 | targets, 124 | frame_lengths, 125 | word_bounds, 126 | word_lengths, 127 | training=True) 128 | 129 | ################## 130 | # Optimize model # 131 | ################## 132 | 133 | optimizer.zero_grad() 134 | 135 | # Backward pass 136 | scaler.scale(train_loss).backward() 137 | 138 | # Update weights 139 | scaler.step(optimizer) 140 | 141 | # Update gradient scaler 142 | scaler.update() 143 | 144 | ############ 145 | # Evaluate # 146 | ############ 147 | 148 | if step % emphases.LOG_INTERVAL == 0: 149 | score = evaluate( 150 | directory, 151 | step, 152 | model, 153 | gpu, 154 | 'valid', 155 | valid_loader) 156 | 157 | ################### 158 | # Save checkpoint # 159 | ################### 160 | 161 | if step >= 300 and score > best: 162 | torchutil.checkpoint.save( 163 | directory / f'{step:08d}.pt', 164 | model, 165 | optimizer, 166 | epoch=epoch, 167 | step=step, 168 | score=score, 169 | best=best) 170 | best = score 171 | 172 | # End training after a certain number of steps 173 | if step >= emphases.NUM_STEPS: 174 | break 175 | 176 | # Update training step count 177 | step += 1 178 | 179 | # Update progress bar 180 | progress.update() 181 | 182 | # Update epoch count 183 | epoch += 1 184 | 185 | # Close progress bar 186 | progress.close() 187 | 188 | # Save final model 189 | torchutil.checkpoint.save( 190 | directory / f'{step:08d}.pt', 191 | model, 192 | optimizer, 193 | epoch=epoch, 194 | step=step, 195 | score=score, 196 | best=best) 197 | 198 | 199 | ############################################################################### 200 | # Evaluation 201 | ############################################################################### 202 | 203 | 204 | def evaluate(directory, step, model, gpu, condition, loader): 205 | """Perform model evaluation""" 206 | device = 'cpu' if gpu is None else f'cuda:{gpu}' 207 | 208 | # Tensorboard audio and figures 209 | waveforms, figures = {}, {} 210 | 211 | # Prepare model for inference 212 | with emphases.inference_context(model): 213 | 214 | # Cache results to evaluate 215 | results = [] 216 | for i, batch in enumerate(loader): 217 | 218 | # Unpack batch 219 | ( 220 | features, 221 | frame_lengths, 222 | word_bounds, 223 | word_lengths, 224 | targets, 225 | alignments, 226 | audio, 227 | stems 228 | ) = batch 229 | 230 | # Copy to GPU 231 | features = features.to(device) 232 | frame_lengths = frame_lengths.to(device) 233 | 
word_bounds = word_bounds.to(device) 234 | word_lengths = word_lengths.to(device) 235 | targets = targets.to(device) 236 | 237 | # Forward pass 238 | logits = model( 239 | features, 240 | frame_lengths, 241 | word_bounds, 242 | word_lengths) 243 | 244 | # Cache results 245 | results.append(( 246 | logits.detach().cpu(), 247 | targets.detach().cpu(), 248 | word_lengths.detach().cpu())) 249 | 250 | # Add audio and figures 251 | if condition == 'valid' and i < emphases.PLOT_EXAMPLES: 252 | 253 | # Postprocess network output 254 | scores = emphases.postprocess(logits) 255 | 256 | # Add audio 257 | samples = emphases.convert.frames_to_samples(frame_lengths[0]) 258 | waveforms[f'audio/{stems[0]}'] = audio[0, :, :samples] 259 | 260 | # Add figure 261 | figures[stems[0]] = emphases.plot.scores( 262 | alignments[0], 263 | scores[0, 0, :word_lengths[0]].cpu(), 264 | targets[0, 0, :word_lengths[0]].cpu()) 265 | 266 | # Stop when we exceed some number of batches 267 | if i + 1 == emphases.LOG_STEPS: 268 | break 269 | 270 | # Setup batch statistics 271 | target_stats = emphases.evaluate.metrics.Statistics() 272 | predicted_stats = emphases.evaluate.metrics.Statistics() 273 | 274 | # Update statistics 275 | for logits, targets, word_lengths in results: 276 | target_stats.update( 277 | targets.to(device), 278 | word_lengths.to(device)) 279 | predicted_stats.update( 280 | emphases.postprocess(logits.to(device)), 281 | word_lengths.to(device)) 282 | 283 | # Setup evaluation metrics 284 | metrics = emphases.evaluate.Metrics(predicted_stats, target_stats) 285 | 286 | # Update metrics 287 | for logits, targets, word_lengths in results: 288 | metrics.update( 289 | logits.to(device), 290 | targets.to(device), 291 | word_lengths.to(device)) 292 | 293 | # Format results 294 | scalars = { 295 | f'{key}/{condition}': value for key, value in metrics().items()} 296 | 297 | # Write to tensorboard 298 | torchutil.tensorboard.update( 299 | directory, 300 | step, 301 | scalars=scalars, 302 | figures=figures, 303 | audio=waveforms, 304 | sample_rate=emphases.SAMPLE_RATE) 305 | 306 | # Return Pearson correlation 307 | return scalars[f'pearson_correlation/{condition}'] 308 | 309 | 310 | ############################################################################### 311 | # Loss function 312 | ############################################################################### 313 | 314 | 315 | def loss( 316 | scores, 317 | targets, 318 | frame_lengths, 319 | word_bounds, 320 | word_lengths, 321 | training=False, 322 | loss_fn=emphases.LOSS): 323 | """Compute masked loss""" 324 | if training and emphases.DOWNSAMPLE_LOCATION == 'inference': 325 | 326 | # If we are not downsampling the network output before the loss, we 327 | # must upsample the targets 328 | targets = emphases.upsample( 329 | targets, 330 | word_bounds, 331 | word_lengths, 332 | frame_lengths) 333 | 334 | # Linear interpolation can cause out-of-range 335 | if emphases.UPSAMPLE_METHOD == 'linear': 336 | targets = torch.clamp(targets, min=0., max=1.) 
337 | 338 | # Frame resolution sequence mask 339 | mask = emphases.model.mask_from_lengths(frame_lengths) 340 | 341 | else: 342 | 343 | # Word resolution sequence mask 344 | mask = emphases.model.mask_from_lengths(word_lengths) 345 | 346 | # Compute masked loss 347 | if loss_fn == 'bce': 348 | return torch.nn.functional.binary_cross_entropy_with_logits( 349 | scores[mask], 350 | targets[mask]) 351 | elif loss_fn == 'mse': 352 | return torch.nn.functional.mse_loss(scores[mask], targets[mask]) 353 | raise ValueError(f'Loss {loss_fn} is not recognized') 354 | -------------------------------------------------------------------------------- /eval/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/eval/.gitkeep -------------------------------------------------------------------------------- /notebooks/select-speakers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "f473bbab-e880-4f10-be02-f2abf38ca9ad", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "%load_ext autoreload\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "6e996f74-4c77-469a-a333-062febcaa78b", 18 | "metadata": { 19 | "tags": [] 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import random\n", 24 | "\n", 25 | "import IPython.display as ipd\n", 26 | "import torchaudio\n", 27 | "\n", 28 | "import emphases" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "9004ca30-9fa2-436b-ad2c-b778b895e6f6", 35 | "metadata": { 36 | "tags": [] 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "dataset = 'libritts'\n", 41 | "directory = emphases.CACHE_DIR / dataset\n", 42 | "files = list(directory.rglob('*.wav'))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "7c23645c-95c3-45f7-8945-40ca6ff0c064", 49 | "metadata": { 50 | "tags": [] 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "speakers = sorted(list(set(file.stem.split('_')[0] for file in files)))\n", 55 | "speaker_sizes = {speaker: 0. 
for speaker in speakers}\n", 56 | "for file in files:\n", 57 | " info = torchaudio.info(file)\n", 58 | " speaker_sizes[file.stem.split('_')[0]] += info.num_frames / info.sample_rate\n", 59 | "total = sum(speaker_sizes.values())\n", 60 | "total" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "fe70a29b-d5fc-4a32-b08f-21c67776b420", 67 | "metadata": { 68 | "tags": [] 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "current = [\n", 73 | " # Top 5 Female\n", 74 | " 40,\n", 75 | " 669,\n", 76 | " 4362,\n", 77 | " 5022,\n", 78 | " 8123,\n", 79 | " \n", 80 | " # Additional female speakers to get to 1/8th \n", 81 | " 5022,\n", 82 | " 696,\n", 83 | " 6272,\n", 84 | " 5163,\n", 85 | "\n", 86 | " # Top 5 Male\n", 87 | " 196,\n", 88 | " 460,\n", 89 | " 1355,\n", 90 | " 3664,\n", 91 | " 7067, # uses character voices\n", 92 | " \n", 93 | " # Additional male speakers to get to 1/8th \n", 94 | " 405,\n", 95 | " 6437,\n", 96 | " 446, # uses character voices\n", 97 | " 4397\n", 98 | "]\n", 99 | "current_total = sum(speaker_sizes[str(speaker)] for speaker in current) \n", 100 | "current_total" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "9abbbc3f-5d18-48d2-ae57-da36fa322da9", 107 | "metadata": { 108 | "tags": [] 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "current_total / total / (1/8)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "944611a7-7a50-4b86-b86d-e79a83d91f8d", 119 | "metadata": { 120 | "tags": [] 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "speaker = 4397\n", 125 | "files = [file for file in (directory / 'audio').rglob('*.wav') if file.stem.startswith(f'{speaker}_')]\n", 126 | "ipd.display(ipd.Audio(random.choice(files)))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "f1c42bc9-95bc-4ceb-8a88-dc6b60867bb1", 133 | "metadata": { 134 | "tags": [] 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "candidates = sorted(speaker_sizes.items(), key=lambda item: item[1], reverse=True)\n", 139 | "candidates" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "2e7925d3-6c35-459e-af2c-f9af74c21bf7", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "env", 154 | "language": "python", 155 | "name": "env" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.9.16" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 5 172 | } 173 | -------------------------------------------------------------------------------- /results/scaling-annotators.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/results/scaling-annotators.pdf -------------------------------------------------------------------------------- /results/scaling-data.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/results/scaling-data.pdf 
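Note on the two scaling figures above: the run.sh script below passes precomputed --scores and --steps values to emphases.plot.scaling when reproducing them. When the corresponding run directories still exist under runs/, those values can instead be recovered from the saved checkpoints, which is what emphases/plot/scaling/core.py falls back to when --scores and --steps are omitted. The following is a minimal sketch of that recovery, assuming torchutil.checkpoint.best_path returns a (path, score) pair, consistent with how it is indexed in emphases/train/__main__.py; the helper name best_score_and_step is illustrative and not part of the package.

import torch
import torchutil

def best_score_and_step(run_directory):
    """Recover the best validation score and its training step for one run"""
    # Locate the checkpoint with the best validation score saved during training
    # (assumption: best_path returns (path, score), as used elsewhere in this repo)
    path, score = torchutil.checkpoint.best_path(run_directory)
    # The training step is stored alongside the model weights by torchutil.checkpoint.save
    checkpoint = torch.load(path, map_location='cpu')
    return score, checkpoint['step']

For example, best_score_and_step(emphases.RUNS_DIR / '3200') should yield the score and step plotted for the 3200-utterance run, provided that run directory exists.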
-------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Runs experiments in the paper 2 | # "Crowdsourced and Automatic Speech Prominence Estimation" 3 | 4 | # Args 5 | # $1 - the GPU index 6 | 7 | SCRIPTDIR="$( dirname -- "$0"; )" 8 | 9 | #################################### 10 | # Annotator redundancy experiments # 11 | #################################### 12 | 13 | 14 | # N.B. - These experiments require Buckeye for evaluation and are therefore 15 | # commented out (see note in README). 16 | 17 | # # 1/64; 8 annotations 18 | # rm -rf data/cache/crowdsource/* 19 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/64-8.py 20 | # python -m emphases.data.preprocess --gpu $1 21 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/64-8.py 22 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/64-8.py --gpu $1 23 | 24 | # # 1/32; 4 annotations 25 | # rm -rf data/cache/crowdsource/* 26 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/32-4.py 27 | # python -m emphases.data.preprocess --gpu $1 28 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/32-4.py 29 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/32-4.py --gpu $1 30 | 31 | # # 1/16; 2 annotations 32 | # rm -rf data/cache/crowdsource/* 33 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/16-2.py 34 | # python -m emphases.data.preprocess --gpu $1 35 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/16-2.py 36 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/16-2.py --gpu $1 37 | 38 | # # 1/8; 1 annotations 39 | # rm -rf data/cache/crowdsource/* 40 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/8-1.py 41 | # python -m emphases.data.preprocess --gpu $1 42 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/8-1.py 43 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/8-1.py --gpu $1 44 | 45 | # # Plot results 46 | # python -m emphases.plot.scaling \ 47 | # --evaluations 8-1 16-2 32-4 64-8 \ 48 | # --xlabel "Annotators per utterance" \ 49 | # --output_file results/scaling-annotators.pdf \ 50 | # --sizes 3200 1600 800 400 \ 51 | # --scores 0.686 0.683 0.667 0.664 \ 52 | # --steps 967 933 567 467 \ 53 | # --yticks 0.66 0.67 0.68 0.69 \ 54 | # --text_offsets 0.007 0.01 0.007 0.007 55 | 56 | 57 | # #################################### 58 | # # Dataset size scaling experiments # 59 | # #################################### 60 | 61 | 62 | # N.B. - These experiments require Buckeye for evaluation and are therefore 63 | # commented out (see note in README). 
64 | 65 | # # 400 utterances 66 | # rm -rf data/cache/crowdsource/* 67 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/400.py 68 | # python -m emphases.data.preprocess --gpu $1 69 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/400.py 70 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/400.py --gpu $1 71 | 72 | # # 800 utterances 73 | # rm -rf data/cache/crowdsource/* 74 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/800.py 75 | # python -m emphases.data.preprocess --gpu $1 76 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/800.py 77 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/800.py --gpu $1 78 | 79 | # # 1600 utterances 80 | # rm -rf data/cache/crowdsource/* 81 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/1600.py 82 | # python -m emphases.data.preprocess --gpu $1 83 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/1600.py 84 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/1600.py --gpu $1 85 | 86 | # # 3200 utterances 87 | # rm -rf data/cache/crowdsource/* 88 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/3200.py 89 | # python -m emphases.data.preprocess --gpu $1 90 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/3200.py 91 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/3200.py --gpu $1 92 | 93 | # # Plot results 94 | # python -m emphases.plot.scaling \ 95 | # --evaluations 400 800 1600 3200 \ 96 | # --xlabel Utterances \ 97 | # --output_file results/scaling-data.pdf \ 98 | # --yticks 0.63 0.65 0.67 0.69 \ 99 | # --scores 0.633 0.657 0.678 0.687 \ 100 | # --steps 400 500 767 1433 \ 101 | # --text_offsets 0.007 0.007 0.007 0.007 102 | 103 | 104 | ############## 105 | # Best model # 106 | ############## 107 | 108 | 109 | python -m emphases.data.download 110 | python -m emphases.data.preprocess --gpu $1 111 | python -m emphases.partition 112 | python -m emphases.train --config $SCRIPTDIR/config/base.py --gpu $1 113 | 114 | 115 | ############# 116 | # Ablations # 117 | ############# 118 | 119 | 120 | python -m emphases.train --config $SCRIPTDIR/config/hparam-search/mse.py --gpu $1 121 | 122 | 123 | ############## 124 | # Downsample # 125 | ############## 126 | 127 | 128 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-inference.py --gpu $1 129 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-intermediate.py --gpu $1 130 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-input.py --gpu $1 131 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-loss.py --gpu $1 132 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-inference.py --gpu $1 133 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-intermediate.py --gpu $1 134 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-input.py --gpu $1 135 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-loss.py --gpu $1 136 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-inference.py --gpu $1 137 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-intermediate.py --gpu $1 138 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-input.py --gpu $1 139 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-loss.py --gpu $1 140 | 141 | 142 | #################################### 
143 | # Large-scale automatic annotation # 144 | #################################### 145 | 146 | 147 | python -m emphases.data.download --datasets automatic --gpu $1 148 | python -m emphases.partition --datasets automatic 149 | python -m emphases.data.preprocess --datasets automatic --gpu $1 150 | python -m emphases.train --config $SCRIPTDIR/config/scaling/base-automatic.py --dataset automatic --gpu $1 151 | 152 | 153 | ############# 154 | # Baselines # 155 | ############# 156 | 157 | 158 | python -m emphases.evaluate --config $SCRIPTDIR/config/baselines/prominence.py 159 | -------------------------------------------------------------------------------- /runs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/runs/.gitkeep -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | with open('README.md', encoding='utf8') as file: 5 | long_description = file.read() 6 | 7 | 8 | setup( 9 | name='emphases', 10 | description='Crowdsourced and Automatic Speech Prominence Estimation', 11 | version='0.0.2', 12 | author='Interactive Audio Lab', 13 | author_email='interactiveaudiolab@gmail.com', 14 | url='https://github.com/interactiveaudiolab/emphases', 15 | install_requires=[ 16 | 'GPUtil', 17 | 'huggingface-hub', 18 | 'librosa', 19 | 'matplotlib', 20 | 'numpy', 21 | 'penn', 22 | 'pycwt', 23 | 'pyfoal', 24 | 'pypar', 25 | 'pyyaml', 26 | 'reseval', 27 | 'scipy', 28 | 'torch', 29 | 'torchutil', 30 | 'torchaudio', 31 | 'yapecs'], 32 | packages=find_packages(), 33 | package_data={'emphases': ['assets/*', 'assets/*/*']}, 34 | long_description=long_description, 35 | long_description_content_type='text/markdown', 36 | keywords=['annotation', 'audio', 'emphasis', 'prominence', 'speech'], 37 | classifiers=['License :: OSI Approved :: MIT License'], 38 | license='MIT') 39 | --------------------------------------------------------------------------------
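Note on the masked loss in emphases/train/core.py: it selects valid positions with emphases.model.mask_from_lengths before computing BCE or MSE, and that helper is defined in emphases/model rather than in this listing. The sketch below shows a typical construction of such a length mask; it is illustrative only, not the package's implementation, and it assumes the mask should broadcast against the (batch, 1, length) score tensors seen in evaluate().

import torch

def length_mask(lengths):
    """Illustrative sketch: boolean mask that is True at valid positions"""
    # lengths is a 1D tensor of per-sequence lengths, shape (batch,)
    positions = torch.arange(int(lengths.max()), device=lengths.device)
    # Compare each position index against each sequence length, then add a
    # channel dimension so the mask matches (batch, 1, length) score tensors
    return (positions[None] < lengths[:, None])[:, None]

Indexing scores[mask] and targets[mask] with such a mask flattens both tensors to only the frames or words that fall within each utterance, so padded positions do not contribute to the loss.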