├── .gitignore ├── LICENSE ├── README.md ├── config ├── base.py ├── baselines │ ├── duration-variance.py │ ├── pitch-variance.py │ └── prominence.py ├── downsample │ ├── average-inference.py │ ├── average-input.py │ ├── average-intermediate.py │ ├── average-loss.py │ ├── center-inference.py │ ├── center-input.py │ ├── center-intermediate.py │ ├── center-loss.py │ ├── max-inference.py │ ├── max-input.py │ ├── max-intermediate.py │ ├── max-loss.py │ ├── sum-inference.py │ ├── sum-input.py │ ├── sum-intermediate.py │ └── sum-loss.py ├── hparam-search │ ├── batch-060000.py │ ├── batch-070000.py │ ├── batch-075000.py │ ├── batch-100000.py │ ├── buckets-1.py │ ├── buckets-2.py │ ├── convolution-5-80.py │ ├── convolution-6-128.py │ ├── convolution-6-64.py │ ├── convolution-6-80.py │ ├── convolution-7-80.py │ ├── decoder-kernel-1.py │ ├── decoder-kernel-5.py │ ├── dropout-05.py │ ├── dropout-10.py │ ├── encoder-kernel-5.py │ ├── encoder-kernel-7.py │ ├── gelu.py │ ├── leaky-relu.py │ ├── mse.py │ └── silu.py └── scaling │ ├── 16-2.py │ ├── 1600.py │ ├── 32-4.py │ ├── 3200.py │ ├── 400.py │ ├── 64-8.py │ ├── 8-1.py │ ├── 800.py │ └── base-automatic.py ├── data ├── cache │ └── .gitkeep ├── datasets │ └── .gitkeep └── sources │ └── .gitkeep ├── emphases ├── __init__.py ├── __main__.py ├── annotate │ ├── __init__.py │ ├── __main__.py │ └── core.py ├── assets │ ├── checkpoints │ │ ├── .gitkeep │ │ └── checkpoint.pt │ ├── configs │ │ └── annotate.yaml │ └── partitions │ │ ├── .gitkeep │ │ ├── automatic.json │ │ ├── buckeye.json │ │ ├── crowdsource.json │ │ └── libritts.json ├── baselines │ ├── __init__.py │ ├── duration_variance │ │ ├── __init__.py │ │ └── core.py │ ├── pitch_variance │ │ ├── __init__.py │ │ └── core.py │ └── prominence │ │ ├── __init__.py │ │ ├── core.py │ │ ├── cwt_utils.py │ │ ├── duration_processing.py │ │ ├── energy_processing.py │ │ ├── f0_processing.py │ │ ├── filter.py │ │ ├── loma.py │ │ ├── pitch_tracker.py │ │ └── smooth_and_interp.py ├── config │ ├── __init__.py │ ├── defaults.py │ └── static.py ├── convert.py ├── core.py ├── data │ ├── __init__.py │ ├── collate.py │ ├── dataset.py │ ├── download │ │ ├── __init__.py │ │ ├── __main__.py │ │ └── core.py │ ├── loader.py │ ├── preprocess │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── core.py │ │ ├── loudness.py │ │ └── mels.py │ └── sampler.py ├── evaluate │ ├── __init__.py │ ├── __main__.py │ ├── core.py │ └── metrics.py ├── load.py ├── model │ ├── __init__.py │ ├── core.py │ └── layers │ │ ├── __init__.py │ │ ├── convolution.py │ │ └── transformer.py ├── partition │ ├── __init__.py │ ├── __main__.py │ └── core.py ├── plot │ ├── __init__.py │ ├── core.py │ └── scaling │ │ ├── __init__.py │ │ ├── __main__.py │ │ └── core.py └── train │ ├── __init__.py │ ├── __main__.py │ └── core.py ├── eval └── .gitkeep ├── notebooks ├── analyze-annotations.ipynb └── select-speakers.ipynb ├── results ├── scaling-annotators.pdf └── scaling-data.pdf ├── run.sh ├── runs └── .gitkeep └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/cache/* 2 | !data/cache/.gitkeep 3 | data/datasets/* 4 | !data/datasets/.gitkeep 5 | data/sources/* 6 | !data/sources/.gitkeep 7 | eval/* 8 | !eval/.gitkeep 9 | runs/* 10 | !runs/.gitkeep 11 | htk/ 12 | config/hyperparam-search/* 13 | utils/* 14 | 15 | __pycache__/ 16 | .DS_Store 17 | ._.DS_Store 18 | .ipynb_checkpoints/ 19 | .vscode/ 20 | *.egg-info/ 21 | */.ipynb_checkpoints/* 22 | dist/ 23 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Interactive Audio Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# Crowdsourced and Automatic Speech Prominence Estimation 2 | 
3 | 4 | [![PyPI](https://img.shields.io/pypi/v/emphases.svg)](https://pypi.python.org/pypi/emphases) 5 | [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) 6 | [![Downloads](https://static.pepy.tech/badge/emphases)](https://pepy.tech/project/emphases) 7 | 8 | Annotation, training, evaluation and inference of speech prominence 9 | 10 | [Paper](https://www.maxrmorrison.com/pdfs/morrison2024crowdsourced.pdf) [Website](https://www.maxrmorrison.com/sites/prominence-estimation) [Dataset](https://zenodo.org/records/10402793) 11 | 12 |
13 | 14 | 15 | ## Table of contents 16 | 17 | - [Installation](#installation) 18 | - [Inference](#inference) 19 | * [Application programming interface](#application-programming-interface) 20 | * [`emphases.from_alignment_and_audio`](#emphasesfrom_alignment_and_audio) 21 | * [`emphases.from_text_and_audio`](#emphasesfrom_text_and_audio) 22 | * [`emphases.from_file`](#emphasesfrom_file) 23 | * [`emphases.from_file_to_file`](#emphasesfrom_file_to_file) 24 | * [`emphases.from_files_to_files`](#emphasesfrom_files_to_files) 25 | * [Command-line interface](#command-line-interface) 26 | - [Training](#training) 27 | * [Download data](#download-data) 28 | * [Annotate data](#annotate-data) 29 | * [Partition data](#partition-data) 30 | * [Preprocess](#preprocess) 31 | * [Train](#train) 32 | 33 | - [Evaluation](#evaluation) 34 | * [Evaluate](#evaluate) 35 | * [Monitor](#monitor) 36 | - [Citation](#citation) 37 | 38 | 39 | ## Installation 40 | 41 | `pip install emphases` 42 | 43 | By default, we use the Penn Phonetic Forced Aligner (P2FA) via the [`pyfoal`](https://github.com/maxrmorrison/pyfoal/) 44 | repo to perform word alignments. This requires installing HTK. See [the HTK 45 | installation instructions](https://github.com/maxrmorrison/pyfoal/tree/main?tab=readme-ov-file#penn-phonetic-forced-aligner-p2fa) 46 | provided by `pyfoal`. Alternatively, you can use a different forced aligner 47 | and either pass the alignment as a [`pypar.Alignment`](https://github.com/maxrmorrison/pypar/tree/main) 48 | object or save the alignment as a `.TextGrid` file. 49 | 50 | 51 | ## Inference 52 | 53 | Perform automatic emphasis annotation using our best pretrained model 54 | 55 | ```python 56 | import emphases 57 | 58 | # Text and audio of speech 59 | text_file = 'example.txt' 60 | audio_file = 'example.wav' 61 | 62 | # Detect emphases 63 | alignment, prominence = emphases.from_file(text_file, audio_file) 64 | 65 | # Check which words were emphasized 66 | for word, score in zip(alignment, prominence[0]): 67 | print(f'{word} has a prominence of {score}') 68 | ``` 69 | 70 | The `alignment` is a [`pypar.Alignment`](https://github.com/maxrmorrison/pypar) 71 | object. 
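Each word in the returned alignment also carries its start and end times. As a minimal sketch continuing the example above (the printed formatting is illustrative, not part of the library):

```python
# Pair each word with its time span and emphasis score
for word, score in zip(alignment, prominence[0]):
    print(f'{word} ({word.start():.2f}-{word.end():.2f} s): {float(score):.3f}')
```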
72 | 73 | 74 | ### Application programming interface 75 | 76 | #### `emphases.from_alignment_and_audio` 77 | 78 | ```python 79 | def from_alignment_and_audio( 80 | alignment: pypar.Alignment, 81 | audio: torch.Tensor, 82 | sample_rate: int, 83 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 84 | batch_size: Optional[int] = None, 85 | gpu: Optional[int] = None 86 | ) -> Tuple[Type[pypar.Alignment], torch.Tensor]: 87 | """Produce emphasis scores for each word 88 | 89 | Args: 90 | alignment: The forced phoneme alignment 91 | audio: The speech waveform 92 | sample_rate: The audio sampling rate 93 | checkpoint: The model checkpoint to use for inference 94 | batch_size: The maximum number of frames per batch 95 | gpu: The index of the gpu to run inference on 96 | 97 | Returns: 98 | scores: The float-valued emphasis scores for each word 99 | """ 100 | ``` 101 | 102 | 103 | #### `emphases.from_text_and_audio` 104 | 105 | ```python 106 | def from_text_and_audio( 107 | text: str, 108 | audio: torch.Tensor, 109 | sample_rate: int, 110 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 111 | batch_size: Optional[int] = None, 112 | gpu: Optional[int] = None 113 | ) -> Tuple[Type[pypar.Alignment], torch.Tensor]: 114 | """Produce emphasis scores for each word 115 | 116 | Args: 117 | text: The speech transcript 118 | audio: The speech waveform 119 | sample_rate: The audio sampling rate 120 | checkpoint: The model checkpoint to use for inference 121 | batch_size: The maximum number of frames per batch 122 | gpu: The index of the gpu to run inference on 123 | 124 | Returns: 125 | alignment: The forced phoneme alignment 126 | scores: The float-valued emphasis scores for each word 127 | """ 128 | ``` 129 | 130 | 131 | #### `emphases.from_file` 132 | 133 | ```python 134 | def from_file( 135 | text_file: Union[str, bytes, os.PathLike], 136 | audio_file: Union[str, bytes, os.PathLike], 137 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 138 | batch_size: Optional[int] = None, 139 | gpu: Optional[int] = None 140 | ) -> Tuple[Type[pypar.Alignment], torch.Tensor]: 141 | """Produce emphasis scores for each word for files on disk 142 | 143 | Args: 144 | text_file: The speech transcript (.txt) or alignment (.TextGrid) file 145 | audio_file: The speech waveform audio file 146 | checkpoint: The model checkpoint to use for inference 147 | batch_size: The maximum number of frames per batch 148 | gpu: The index of the gpu to run inference on 149 | 150 | Returns: 151 | alignment: The forced phoneme alignment 152 | scores: The float-valued emphasis scores for each word 153 | """ 154 | ``` 155 | 156 | 157 | #### `emphases.from_file_to_file` 158 | 159 | ```python 160 | def from_file_to_file( 161 | text_file: List[Union[str, bytes, os.PathLike]], 162 | audio_file: List[Union[str, bytes, os.PathLike]], 163 | output_prefix: Optional[List[Union[str, bytes, os.PathLike]]] = None, 164 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 165 | batch_size: Optional[int] = None, 166 | gpu: Optional[int] = None 167 | ) -> None: 168 | """Produce emphasis scores for each word for files on disk and save to disk 169 | 170 | Args: 171 | text_file: The speech transcript (.txt) or alignment (.TextGrid) file 172 | audio_file: The speech waveform audio file 173 | output_prefix: The output prefix. Defaults to text file stem. 
174 | checkpoint: The model checkpoint to use for inference 175 | batch_size: The maximum number of frames per batch 176 | gpu: The index of the gpu to run inference on 177 | """ 178 | ``` 179 | 180 | Emphases are saved as a list of five-tuples containing the word, start time, 181 | end time, a float-valued emphasis score, and a boolean that is true if the 182 | word is emphasized. 183 | 184 | 185 | #### `emphases.from_files_to_files` 186 | 187 | ```python 188 | def from_files_to_files( 189 | text_files: List[Union[str, bytes, os.PathLike]], 190 | audio_files: List[Union[str, bytes, os.PathLike]], 191 | output_prefixes: Optional[List[Union[str, bytes, os.PathLike]]] = None, 192 | checkpoint: Optional[Union[str, bytes, os.PathLike]] = None, 193 | batch_size: Optional[int] = None, 194 | gpu: Optional[int] = None 195 | ) -> None: 196 | """Produce emphasis scores for each word for many files and save to disk 197 | 198 | Args: 199 | text_file: The speech transcript (.txt) or alignment (.TextGrid) files 200 | audio_files: The corresponding speech audio files 201 | output_prefixes: The output files. Defaults to text file stems. 202 | checkpoint: The model checkpoint to use for inference 203 | batch_size: The maximum number of frames per batch 204 | gpu: The index of the gpu to run inference on 205 | """ 206 | ``` 207 | 208 | 209 | ### Command-line interface 210 | 211 | ``` 212 | python -m emphases 213 | [-h] 214 | --text_files TEXT_FILES [TEXT_FILES ...] 215 | --audio_files AUDIO_FILES [AUDIO_FILES ...] 216 | [--output_files OUTPUT_FILES [OUTPUT_FILES ...]] 217 | [--checkpoint CHECKPOINT] 218 | [--batch_size BATCH_SIZE] 219 | [--gpu GPU] 220 | 221 | Determine which words in a speech file are emphasized 222 | 223 | options: 224 | -h, --help show this help message and exit 225 | --text_files TEXT_FILES [TEXT_FILES ...] 226 | The speech transcript text files 227 | --audio_files AUDIO_FILES [AUDIO_FILES ...] 228 | The corresponding speech audio files 229 | --output_files OUTPUT_FILES [OUTPUT_FILES ...] 230 | The output files. Default is text files with json suffix. 231 | --checkpoint CHECKPOINT 232 | The model checkpoint to use for inference 233 | --batch_size BATCH_SIZE 234 | The maximum number of frames per batch 235 | --gpu GPU The index of the gpu to run inference on 236 | ``` 237 | 238 | 239 | ## Training 240 | 241 | ### Download data 242 | 243 | `python -m emphases.download --datasets `. 244 | 245 | Downloads and uncompresses datasets. 246 | 247 | **N.B.** We omit Buckeye for public release. This evaluation dataset can be 248 | made by [downloading Buckeye](https://buckeyecorpus.osu.edu/) and matching 249 | the files to the 250 | [annotations](https://github.com/ProSD-Lab/Prominence-perception-in-English-French-Spanish/). 251 | The process of matching the files to the annotations was done for us and is 252 | tricky to replicate exactly. However, due to licensing restrictions on 253 | Buckeye, we cannot legally distribute our private, aligned annotations. 254 | 255 | 256 | ### Annotate data 257 | 258 | Performing annotation requires first installing 259 | [Reproducible Subjective Evaluation (ReSEval)](https://github.com/reseval/reseval). 260 | 261 | `python -m emphases.annotate --datasets ` 262 | 263 | Launches a local web application to perform emphasis annotation, according to 264 | the ReSEval configuration file `emphases/assets/configs/annotate.yaml`. 265 | Requires ReSEval to be installed. 
266 | 267 | `python -m emphases.annotate --datasets --remote --production` 268 | 269 | Launches a crowdsourced emphasis annotation task, according to the ReSEval 270 | configuration file `emphases/assets/configs/annotate.yaml`. Requires ReSEval 271 | to be installed. 272 | 273 | 274 | ### Partition data 275 | 276 | `python -m emphases.partition` 277 | 278 | Generates `train`, `valid`, and `test` partitions for all datasets. 279 | Partitioning is deterministic given the same random seed. You do not need to 280 | run this step, as the original partitions are saved in 281 | `emphases/assets/partitions`. 282 | 283 | 284 | ### Preprocess 285 | 286 | `python -m emphases.preprocess` 287 | 288 | 289 | ### Train 290 | 291 | `python -m emphases.train --config --dataset --gpus ` 292 | 293 | Trains a model according to a given configuration. Uses a list of GPU 294 | indices as an argument, and uses distributed data parallelism (DDP) 295 | if more than one index is given. For example, `--gpus 0 3` will train 296 | using DDP on GPUs `0` and `3`. 297 | 298 | 299 | ## Evaluation 300 | 301 | ### Evaluate 302 | 303 | `python -m emphases.evaluate --config --checkpoint --gpu ` 304 | 305 | 306 | ### Monitor 307 | 308 | Run `tensorboard --logdir runs/`. If you are running training 309 | remotely, you must create a SSH connection with port forwarding to view 310 | Tensorboard. This can be done with `ssh -L 6006:localhost:6006 311 | @`. Then, open `localhost:6006` in your browser. 312 | 313 | 314 | ## Citation 315 | 316 | ### IEEE 317 | M. Morrison, P. Pawar, N. Pruyne, J. Cole, and B. Pardo, "Crowdsourced and Automatic Speech Prominence Estimation," International Conference on Acoustics, Speech, & Signal Processing, 2024. 318 | 319 | 320 | ### BibTex 321 | 322 | ``` 323 | @inproceedings{morrison2024crowdsourced, 324 | title={Crowdsourced and Automatic Speech Prominence Estimation}, 325 | author={Morrison, Max and Pawar, Pranav and Pruyne, Nathan and Cole, Jennifer and Pardo, Bryan}, 326 | booktitle={International Conference on Acoustics, Speech, & Signal Processing}, 327 | year={2024} 328 | } 329 | -------------------------------------------------------------------------------- /config/base.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'base' 5 | -------------------------------------------------------------------------------- /config/baselines/duration-variance.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'duration-variance' 5 | 6 | # Method to use for inference 7 | METHOD = 'duration-variance' 8 | -------------------------------------------------------------------------------- /config/baselines/pitch-variance.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'pitch-variance' 5 | 6 | # Method to use for inference 7 | METHOD = 'pitch-variance' 8 | -------------------------------------------------------------------------------- /config/baselines/prominence.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'prominence' 5 | 6 | # Method to use for inference 7 | METHOD = 'prominence' 8 | -------------------------------------------------------------------------------- /config/downsample/average-inference.py: 
-------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'average-inference' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'inference' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'average' 13 | -------------------------------------------------------------------------------- /config/downsample/average-input.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'average-input' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'input' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'average' 13 | -------------------------------------------------------------------------------- /config/downsample/average-intermediate.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'average-intermediate' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'intermediate' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'average' 13 | -------------------------------------------------------------------------------- /config/downsample/average-loss.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'average-loss' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'loss' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'average' 13 | -------------------------------------------------------------------------------- /config/downsample/center-inference.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'center-inference' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'inference' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'center' 13 | -------------------------------------------------------------------------------- /config/downsample/center-input.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'center-input' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 
8 | DOWNSAMPLE_LOCATION = 'input' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'center' 13 | -------------------------------------------------------------------------------- /config/downsample/center-intermediate.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'center-intermediate' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'intermediate' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'center' 13 | -------------------------------------------------------------------------------- /config/downsample/center-loss.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'center-loss' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'loss' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'center' 13 | -------------------------------------------------------------------------------- /config/downsample/max-inference.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'max-inference' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'inference' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'max' 13 | -------------------------------------------------------------------------------- /config/downsample/max-input.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'max-input' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'input' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'max' 13 | -------------------------------------------------------------------------------- /config/downsample/max-intermediate.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'max-intermediate' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'intermediate' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 
12 | DOWNSAMPLE_METHOD = 'max' 13 | -------------------------------------------------------------------------------- /config/downsample/max-loss.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'max-loss' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'loss' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'max' 13 | -------------------------------------------------------------------------------- /config/downsample/sum-inference.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'sum-inference' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'inference' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'sum' 13 | -------------------------------------------------------------------------------- /config/downsample/sum-input.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'sum-input' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'input' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'sum' 13 | -------------------------------------------------------------------------------- /config/downsample/sum-intermediate.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'sum-intermediate' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'intermediate' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 12 | DOWNSAMPLE_METHOD = 'sum' 13 | -------------------------------------------------------------------------------- /config/downsample/sum-loss.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'sum-loss' 5 | 6 | # Location to perform resampling from frame resolution to word resolution. 7 | # One of ['inference', 'input', 'intermediate', 'loss']. 8 | DOWNSAMPLE_LOCATION = 'loss' 9 | 10 | # Method to use for resampling from frame resolution to word resolution. 11 | # One of ['average', 'center', 'max', 'sum']. 
12 | DOWNSAMPLE_METHOD = 'sum' 13 | -------------------------------------------------------------------------------- /config/hparam-search/batch-060000.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'batch-060000' 5 | 6 | # Maximum number of frames in one batch 7 | MAX_TRAINING_FRAMES = 60000 8 | -------------------------------------------------------------------------------- /config/hparam-search/batch-070000.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'batch-070000' 5 | 6 | # Maximum number of frames in one batch 7 | MAX_TRAINING_FRAMES = 70000 8 | -------------------------------------------------------------------------------- /config/hparam-search/batch-075000.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'batch-075000' 5 | 6 | # Maximum number of frames in one batch 7 | MAX_TRAINING_FRAMES = 75000 8 | -------------------------------------------------------------------------------- /config/hparam-search/batch-100000.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'batch-100000' 5 | 6 | # Maximum number of frames in one batch 7 | MAX_TRAINING_FRAMES = 100000 8 | -------------------------------------------------------------------------------- /config/hparam-search/buckets-1.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'buckets-1' 5 | 6 | # Number of buckets of data lengths used by the sampler 7 | BUCKETS = 1 8 | -------------------------------------------------------------------------------- /config/hparam-search/buckets-2.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'buckets-2' 5 | 6 | # Number of buckets of data lengths used by the sampler 7 | BUCKETS = 2 8 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-5-80.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-5-80' 5 | 6 | # Model width 7 | CHANNELS = 80 8 | 9 | # Number of network layers 10 | LAYERS = 5 11 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-6-128.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-6-128' 5 | 6 | # Model width 7 | CHANNELS = 128 8 | 9 | # Number of network layers 10 | LAYERS = 6 11 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-6-64.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-6-64' 5 | 6 | # Model width 7 | CHANNELS = 64 8 | 9 | # Number of network layers 10 | LAYERS = 6 11 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-6-80.py: 
-------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-6-80' 5 | 6 | # Model width 7 | CHANNELS = 80 8 | 9 | # Number of network layers 10 | LAYERS = 6 11 | -------------------------------------------------------------------------------- /config/hparam-search/convolution-7-80.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'convolution-7-80' 5 | 6 | # Model width 7 | CHANNELS = 80 8 | 9 | # Number of network layers 10 | LAYERS = 7 11 | -------------------------------------------------------------------------------- /config/hparam-search/decoder-kernel-1.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'decoder-kernel-1' 5 | 6 | # Decoder convolution kernel size 7 | DECODER_KERNEL_SIZE = 1 8 | -------------------------------------------------------------------------------- /config/hparam-search/decoder-kernel-5.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'decoder-kernel-5' 5 | 6 | # Decoder convolution kernel size 7 | DECODER_KERNEL_SIZE = 5 8 | -------------------------------------------------------------------------------- /config/hparam-search/dropout-05.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'dropout-05' 5 | 6 | # Dropout probability (or None to not use dropout) 7 | DROPOUT = .05 8 | -------------------------------------------------------------------------------- /config/hparam-search/dropout-10.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'dropout-10' 5 | 6 | # Dropout probability (or None to not use dropout) 7 | DROPOUT = .1 8 | -------------------------------------------------------------------------------- /config/hparam-search/encoder-kernel-5.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'encoder-kernel-5' 5 | 6 | # Encoder convolution kernel size 7 | ENCODER_KERNEL_SIZE = 5 8 | -------------------------------------------------------------------------------- /config/hparam-search/encoder-kernel-7.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'encoder-kernel-7' 5 | 6 | # Encoder convolution kernel size 7 | ENCODER_KERNEL_SIZE = 7 8 | -------------------------------------------------------------------------------- /config/hparam-search/gelu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | MODULE = 'emphases' 4 | 5 | # Configuration name 6 | CONFIG = 'gelu' 7 | 8 | # Activation function to use in convolution model 9 | ACTIVATION_FUNCTION = torch.nn.GELU 10 | -------------------------------------------------------------------------------- /config/hparam-search/leaky-relu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | MODULE = 'emphases' 4 | 5 | # Configuration name 6 | CONFIG = 'leaky-relu' 7 | 8 | # Activation function to use in 
convolution model 9 | ACTIVATION_FUNCTION = torch.nn.LeakyReLU 10 | -------------------------------------------------------------------------------- /config/hparam-search/mse.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'mse' 5 | 6 | # Loss function. One of ['bce', 'mse'] 7 | LOSS = 'mse' 8 | -------------------------------------------------------------------------------- /config/hparam-search/silu.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | MODULE = 'emphases' 4 | 5 | # Configuration name 6 | CONFIG = 'silu' 7 | 8 | # Activation function to use in convolution model 9 | ACTIVATION_FUNCTION = torch.nn.SiLU 10 | -------------------------------------------------------------------------------- /config/scaling/16-2.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '16-2' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of allowed annotations 10 | MAX_ANNOTATIONS = 2 11 | 12 | # Maximum number of training utterances 13 | MAX_TRAINING_UTTERANCES = 1600 14 | 15 | # Minimum number of allowed annotations 16 | MIN_ANNOTATIONS = 2 17 | 18 | # Whether to use the specified one-eighth dataset for scaling law experiments 19 | ONE_EIGHTH_UTTERANCES = True 20 | 21 | # Dataset to use for validation 22 | VALIDATION_DATASET = 'buckeye' 23 | -------------------------------------------------------------------------------- /config/scaling/1600.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '1600' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of training utterances 10 | MAX_TRAINING_UTTERANCES = 1600 11 | 12 | # Whether to use the specified one-eighth dataset for scaling law experiments 13 | ONE_EIGHTH_UTTERANCES = True 14 | 15 | # Dataset to use for validation 16 | VALIDATION_DATASET = 'buckeye' 17 | -------------------------------------------------------------------------------- /config/scaling/32-4.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '32-4' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of allowed annotations 10 | MAX_ANNOTATIONS = 4 11 | 12 | # Maximum number of training utterances 13 | MAX_TRAINING_UTTERANCES = 800 14 | 15 | # Minimum number of allowed annotations 16 | MIN_ANNOTATIONS = 4 17 | 18 | # Whether to use the specified one-eighth dataset for scaling law experiments 19 | ONE_EIGHTH_UTTERANCES = True 20 | 21 | # Dataset to use for validation 22 | VALIDATION_DATASET = 'buckeye' 23 | -------------------------------------------------------------------------------- /config/scaling/3200.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '3200' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of training utterances 10 | MAX_TRAINING_UTTERANCES = 3200 11 | 12 | # Whether to use the specified one-eighth dataset for scaling law experiments 13 | ONE_EIGHTH_UTTERANCES = True 14 | 15 | # Dataset to use for 
validation 16 | VALIDATION_DATASET = 'buckeye' 17 | -------------------------------------------------------------------------------- /config/scaling/400.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '400' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of training utterances 10 | MAX_TRAINING_UTTERANCES = 400 11 | 12 | # Whether to use the specified one-eighth dataset for scaling law experiments 13 | ONE_EIGHTH_UTTERANCES = True 14 | 15 | # Dataset to use for validation 16 | VALIDATION_DATASET = 'buckeye' 17 | -------------------------------------------------------------------------------- /config/scaling/64-8.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '64-8' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of allowed annotations 10 | MAX_ANNOTATIONS = 8 11 | 12 | # Maximum number of training utterances 13 | MAX_TRAINING_UTTERANCES = 400 14 | 15 | # Minimum number of allowed annotations 16 | MIN_ANNOTATIONS = 8 17 | 18 | # Whether to use the specified one-eighth dataset for scaling law experiments 19 | ONE_EIGHTH_UTTERANCES = True 20 | 21 | # Dataset to use for validation 22 | VALIDATION_DATASET = 'buckeye' 23 | -------------------------------------------------------------------------------- /config/scaling/8-1.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '8-1' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of allowed annotations 10 | MAX_ANNOTATIONS = 1 11 | 12 | # Maximum number of training utterances 13 | MAX_TRAINING_UTTERANCES = 3200 14 | 15 | # Minimum number of allowed annotations 16 | MIN_ANNOTATIONS = 1 17 | 18 | # Whether to use the specified one-eighth dataset for scaling law experiments 19 | ONE_EIGHTH_UTTERANCES = True 20 | 21 | # Dataset to use for validation 22 | VALIDATION_DATASET = 'buckeye' 23 | -------------------------------------------------------------------------------- /config/scaling/800.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = '800' 5 | 6 | # Datasets to use for evaluation 7 | EVALUATION_DATASETS = ['buckeye'] 8 | 9 | # Maximum number of training utterances 10 | MAX_TRAINING_UTTERANCES = 800 11 | 12 | # Whether to use the specified one-eighth dataset for scaling law experiments 13 | ONE_EIGHTH_UTTERANCES = True 14 | 15 | # Dataset to use for validation 16 | VALIDATION_DATASET = 'buckeye' 17 | -------------------------------------------------------------------------------- /config/scaling/base-automatic.py: -------------------------------------------------------------------------------- 1 | MODULE = 'emphases' 2 | 3 | # Configuration name 4 | CONFIG = 'base-automatic' 5 | -------------------------------------------------------------------------------- /data/cache/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/data/cache/.gitkeep -------------------------------------------------------------------------------- /data/datasets/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/data/datasets/.gitkeep -------------------------------------------------------------------------------- /data/sources/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/data/sources/.gitkeep -------------------------------------------------------------------------------- /emphases/__init__.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Configuration 3 | ############################################################################### 4 | 5 | 6 | # Default configuration parameters to be modified 7 | from .config import defaults 8 | 9 | # Modify configuration 10 | import yapecs 11 | yapecs.configure('emphases', defaults) 12 | 13 | # Import configuration parameters 14 | from .config.defaults import * 15 | from .config.static import * 16 | 17 | 18 | ############################################################################### 19 | # Module imports 20 | ############################################################################### 21 | 22 | 23 | from .core import * 24 | from .model import Model 25 | from . train import loss, train 26 | from . import annotate 27 | from . import baselines 28 | from . import convert 29 | from . import data 30 | from . import evaluate 31 | from . import load 32 | from . import partition 33 | from . import plot 34 | -------------------------------------------------------------------------------- /emphases/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Determine which words in a speech file are emphasized 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser( 15 | description='Determine which words in a speech file are emphasized') 16 | parser.add_argument( 17 | '--text_files', 18 | type=Path, 19 | nargs='+', 20 | required=True, 21 | help='The speech transcript (.txt) or alignment (.TextGrid) files') 22 | parser.add_argument( 23 | '--audio_files', 24 | type=Path, 25 | nargs='+', 26 | required=True, 27 | help='The corresponding speech audio files') 28 | parser.add_argument( 29 | '--output_prefixes', 30 | type=Path, 31 | nargs='+', 32 | required=False, 33 | help='output_prefixes: The output files. 
Defaults to text files stems.') 34 | parser.add_argument( 35 | '--checkpoint', 36 | type=Path, 37 | help='The model checkpoint to use for inference') 38 | parser.add_argument( 39 | '--batch_size', 40 | type=int, 41 | help='The maximum number of frames per batch') 42 | parser.add_argument( 43 | '--gpu', 44 | type=int, 45 | help='The index of the gpu to run inference on') 46 | return parser.parse_args() 47 | 48 | 49 | if __name__ == '__main__': 50 | emphases.from_files_to_files(**vars(parse_args())) 51 | -------------------------------------------------------------------------------- /emphases/annotate/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/annotate/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Annotate emphases 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser(description='Perform emphasis annotation') 15 | parser.add_argument( 16 | '--annotation_config', 17 | type=Path, 18 | default=emphases.DEFAULT_ANNOTATION_CONFIG, 19 | help='The ReSEval configuration file for the annotation task') 20 | parser.add_argument( 21 | '--dataset', 22 | default='libritts', 23 | help='The dataset to annotate') 24 | parser.add_argument( 25 | '--directory', 26 | type=Path, 27 | default=emphases.ANNOTATION_DIR, 28 | help='The directory to save results to') 29 | parser.add_argument( 30 | '--remote', 31 | action='store_true', 32 | help='Run subjective evaluation remotely') 33 | parser.add_argument( 34 | '--production', 35 | action='store_true', 36 | help='Deploy the subjective evaluation to crowdsource participants') 37 | parser.add_argument( 38 | '--interval', 39 | type=int, 40 | default=120, 41 | help='The time between monitoring updates in seconds') 42 | return parser.parse_args() 43 | 44 | 45 | if __name__ == '__main__': 46 | emphases.annotate.datasets(**vars(parse_args())) 47 | -------------------------------------------------------------------------------- /emphases/annotate/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import shutil 4 | 5 | import pypar 6 | import reseval 7 | 8 | import emphases 9 | 10 | 11 | ############################################################################### 12 | # Annotate emphases 13 | ############################################################################### 14 | 15 | 16 | def datasets( 17 | annotation_config=emphases.DEFAULT_ANNOTATION_CONFIG, 18 | dataset='libritts', 19 | directory=emphases.ANNOTATION_DIR, 20 | remote=False, 21 | production=False, 22 | interval=120): 23 | """Perform emphasis annotation on datasets""" 24 | # Create input and output directories 25 | directory.mkdir(exist_ok=True, parents=True) 26 | index = f'{len(list(directory.glob("*"))):02}' 27 | input_directory = directory / index 28 | input_directory.mkdir(exist_ok=True, parents=True) 29 | output_directory = emphases.DATA_DIR / 'crowdsource' / index 30 | output_directory.mkdir(exist_ok=True, parents=True) 31 | 32 | # Get audio files 33 | cache_directory = emphases.CACHE_DIR / dataset 34 | 
audio_files = sorted(list(cache_directory.rglob('*.wav'))) 35 | 36 | # Deterministic shuffle 37 | random.seed(emphases.RANDOM_SEED) 38 | random.shuffle(audio_files) 39 | 40 | # Iterate over audio files 41 | for audio_file in audio_files: 42 | 43 | # Save audio 44 | shutil.copyfile(audio_file, input_directory / audio_file.name) 45 | 46 | # Load alignment 47 | alignment = pypar.Alignment( 48 | cache_directory / 49 | 'alignment' / 50 | f'{audio_file.stem}.TextGrid') 51 | 52 | # Save text 53 | text_file = input_directory / f'{audio_file.stem}-words.txt' 54 | with open(text_file, 'w') as file: 55 | file.write( 56 | ' '.join([ 57 | str(word) for word in alignment 58 | if str(word) != pypar.SILENCE])) 59 | 60 | # Run annotation 61 | reseval.run( 62 | annotation_config, 63 | input_directory, 64 | output_directory, 65 | not remote, 66 | production, 67 | interval) 68 | -------------------------------------------------------------------------------- /emphases/assets/checkpoints/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/assets/checkpoints/.gitkeep -------------------------------------------------------------------------------- /emphases/assets/checkpoints/checkpoint.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/assets/checkpoints/checkpoint.pt -------------------------------------------------------------------------------- /emphases/assets/configs/annotate.yaml: -------------------------------------------------------------------------------- 1 | # A name to give to this evaluation configuration 2 | name: emphasis-annotation 3 | 4 | # The type of test to run. One of [ab, abx, mos, mushra, wordselect]. 5 | test: wordselect 6 | 7 | # The type of data to use. One of [audio, image, text, video]. 8 | datatype: audio 9 | 10 | # The location to store files used for evaluation. One of [aws]. 11 | storage: aws 12 | 13 | # The third-party platform hosting the MySQL database. One of [aws, heroku]. 14 | database: aws 15 | 16 | # The third-party platform hosting the server. One of [aws, heroku]. 17 | server: aws 18 | 19 | # Crowdsourcing configuration 20 | crowdsource: 21 | 22 | # The crowdsourcing platform used for evaluation. One of [mturk]. 23 | platform: mturk 24 | 25 | # The survey title shown to potential participants 26 | title: Emphasis annotation 27 | 28 | # The survey description shown to potential participants 29 | description: "Participate in a research study by listening to English speech and selecting emphasized words. Requires headphones and a quiet listening environment to pass listening test prescreening. $2.89 bonus on completion. Estimated 15 minutes ($13.35 / hour)." 
30 | 31 | # Keywords that participants can use to find your survey 32 | keywords: annotate, audio, emphasis, headphones, speech 33 | 34 | # Filter participants 35 | filter: 36 | 37 | # Only allow participants from a certain countries 38 | countries: ['US'] 39 | 40 | # Only allow participants who have previously completed at least this 41 | # number of tasks 42 | approved_tasks: 1000 43 | 44 | # Only allow participants who have a sufficiently high acceptance rating 45 | approval_rating: 99 46 | 47 | # How much you pay participants (in US dollars) 48 | # E.g., 2.00 is two dollars; 0.50 is fifty cents 49 | payment: 50 | 51 | # The amount that you pay even if they don't pass prescreening 52 | base: 0.45 53 | 54 | # The additional amount that you pay participants who complete evaluation 55 | completion: 2.89 56 | 57 | # How long to wait for things (in seconds) 58 | duration: 59 | 60 | # Total lifespan of the evaluation, after which the evaluation is no 61 | # longer available for participants to take 62 | total: 604800 63 | 64 | # The maximum time you will allow a participant to spend on your task 65 | assignment: 5400 66 | 67 | # Duration after which payment is automatically made 68 | autoapprove: 172800 69 | 70 | # The number of participants 71 | participants: 10 72 | 73 | # The number of evaluations each participant performs 74 | samples_per_participant: 20 75 | 76 | # A seed to use for deterministic random sampling 77 | random_seed: 0 78 | 79 | # Introduction text to display on the first page participants visit 80 | # N.B. This is not the actual IRB-approved survey text used in our studies, 81 | # as we do not want others claiming to be part of our IRB-approved study. 82 | welcome_text: " 83 | # **Welcome!**\n 84 | We are conducting a research study to evaluate the 85 | quality of an audio processing algorithm. If you agree to participate, you 86 | will be asked to fill out a brief questionnaire. You will then be asked to 87 | evaluate a series of audio samples.\n 88 | ### **Privacy**\nThis survey is completely anonymous. We will NOT collect 89 | any personally identifiable information. Your participation in this study 90 | does not involve any risk to you beyond that of your everyday life.\n 91 | ### **Consent**\nBy pressing **I Agree**, you confirm you are willing 92 | to participate in this research. However, you are free to withdraw your 93 | participation at any time.\n 94 | ### **Contact Information**\nIf you have any questions or feedback, 95 | please contact ." 96 | 97 | # Questions that participants must answer before they are permitted to 98 | # perform evaluation. If a multiple choice question has correct_answer 99 | # defined, the participant must select that answer to be able to continue 100 | # to the evaluation. 101 | prescreen_questions: [] 102 | 103 | # Include an audio listening test 104 | listening_test: 105 | 106 | # Listening test instructions 107 | instructions: " 108 | ## **Instructions** \nMake sure your headphones are on and your volume 109 | is turned up to a comfortable level. Listen to the audio. Then, select 110 | how many tones you heard." 111 | 112 | # Number of questions to include on the listening test 113 | num_questions: 2 114 | 115 | # Number of allowed retries before the participant fails the test 116 | retries: 2 117 | 118 | # Instructions presented to the participant during evaluation 119 | survey_instructions: " 120 | 121 | ## **Instructions** \nListen to the audio file a minimum of two times. 
122 | Select the words that were emphasized by the speaker. The emphasized 123 | words are those that stand out from nearby words. Play the audio and then 124 | click on a word to select (boldface) or deselect it." 125 | 126 | # Questions presented to the participant after evaluation 127 | followup_questions: 128 | 129 | # Ask participant for their native language 130 | - name: Language 131 | 132 | # The type of question. One of [free-response, multiple-choice]. 133 | type: multiple-choice 134 | 135 | # Question text 136 | text: What is your native language? 137 | 138 | # Possible answers 139 | answers: [ 140 | 'Albanian', 141 | 'Amharic', 142 | 'Arabic', 143 | 'Bengali', 144 | 'Berber', 145 | 'Creole', 146 | 'Dari', 147 | 'Dzongkha', 148 | 'English', 149 | 'Farsi', 150 | 'Filipino', 151 | 'French', 152 | 'German', 153 | 'Gujarati', 154 | 'Hakka', 155 | 'Hausa', 156 | 'Hebrew', 157 | 'Hindi', 158 | 'Hokkien', 159 | 'Indonesian', 160 | 'Italian', 161 | 'Japanese', 162 | 'Javanese', 163 | 'Kannada', 164 | 'Korean', 165 | 'Mandarin Chinese', 166 | 'Marathi', 167 | 'Nepali', 168 | 'Nigerian Pidgin', 169 | 'Oromo', 170 | 'Pashto', 171 | 'Patois', 172 | 'Polish', 173 | 'Portuguese', 174 | 'Russian', 175 | 'Spanish', 176 | 'Swahili', 177 | 'Somali', 178 | 'Tagalog', 179 | 'Tamil', 180 | 'Telugu', 181 | 'Thai', 182 | 'Turkish', 183 | 'Ukrainian', 184 | 'Urdu', 185 | 'Uzbek', 186 | 'Vietnamese', 187 | 'Western Punjabi', 188 | 'Wu Chinese', 189 | 'Yue Chinese', 190 | 'Other'] 191 | 192 | # Ask participant for their country of origin 193 | - name: Country 194 | 195 | # The type of question. One of [free-response, multiple-choice]. 196 | type: multiple-choice 197 | 198 | # Question text 199 | text: What country/region did you live in during your childhood? 200 | 201 | # Possible answers 202 | answers: [ 203 | 'Afghanistan', 204 | 'Albania', 205 | 'Argentina', 206 | 'Bangladesh', 207 | 'Bhutan', 208 | 'Brazil', 209 | 'Cameroon', 210 | 'Canada', 211 | 'China', 212 | 'Colombia', 213 | 'Cuba', 214 | 'Dominican Republic', 215 | 'Ecuador', 216 | 'Egypt', 217 | 'El Salvador', 218 | 'Ethiopia', 219 | 'France', 220 | 'Germany', 221 | 'Ghana', 222 | 'Guatemala', 223 | 'Guyana', 224 | 'Haiti', 225 | 'Honduras', 226 | 'India', 227 | 'Iran', 228 | 'Iraq', 229 | 'Israel', 230 | 'Jamaica', 231 | 'Japan', 232 | 'Jordan', 233 | 'Kenya', 234 | 'Mexico', 235 | 'Morocco', 236 | 'Nepal', 237 | 'Nicaragua', 238 | 'Nigeria', 239 | 'Pakistan', 240 | 'Peru', 241 | 'Philippines', 242 | 'Poland', 243 | 'Russia', 244 | 'Somalia', 245 | 'South Korea', 246 | 'Syria', 247 | 'Taiwan', 248 | 'Thailand', 249 | 'Turkey', 250 | 'Ukraine', 251 | 'United Kingdom', 252 | 'United States', 253 | 'Uzbekistan', 254 | 'Venezuela', 255 | 'Vietnam', 256 | 'Yemen', 257 | 'Other'] 258 | -------------------------------------------------------------------------------- /emphases/assets/partitions/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/assets/partitions/.gitkeep -------------------------------------------------------------------------------- /emphases/assets/partitions/buckeye.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": [], 3 | "valid": [], 4 | "test": [ 5 | "s25-1", 6 | "s04-1", 7 | "s16-1", 8 | "s26-1", 9 | "s02-1", 10 | "s03-1", 11 | "s22-1", 12 | "s32-1", 13 | "s21-1", 14 | "s24-1", 15 | "s17-1", 16 | "s14-1", 17 | "s11-1" 18 
| ] 19 | } -------------------------------------------------------------------------------- /emphases/baselines/__init__.py: -------------------------------------------------------------------------------- 1 | from . import prominence 2 | from . import duration_variance 3 | from . import pitch_variance 4 | -------------------------------------------------------------------------------- /emphases/baselines/duration_variance/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/baselines/duration_variance/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | ############################################################################### 5 | # Duration variance baseline method 6 | ############################################################################### 7 | 8 | 9 | def infer(alignment): 10 | """Compute per-word emphasis scores using duration variance method""" 11 | # Average duration of phonemes in the sentence 12 | average_duration = alignment.duration() / len(alignment.phonemes()) 13 | 14 | # Average duration of phonemes in each word 15 | average_duration_per_word = torch.tensor([ 16 | word.duration() / len(word) for word in alignment]) 17 | 18 | # Zero-center 19 | return (average_duration_per_word - average_duration)[None] 20 | -------------------------------------------------------------------------------- /emphases/baselines/pitch_variance/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/baselines/pitch_variance/core.py: -------------------------------------------------------------------------------- 1 | import penn 2 | import torch 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Pitch variance method 9 | ############################################################################### 10 | 11 | 12 | def infer(alignment, audio, sample_rate, gpu=None): 13 | """Compute per-word emphasis scores using pitch variance method""" 14 | # Infer pitch and periodicity 15 | pitch, _ = penn.from_audio( 16 | audio, 17 | sample_rate, 18 | hopsize=emphases.HOPSIZE_SECONDS, 19 | fmin=emphases.FMIN, 20 | fmax=emphases.FMAX, 21 | pad=True, 22 | interp_unvoiced_at=emphases.VOICED_THRESHOLD, 23 | gpu=gpu) 24 | 25 | # Compute pitch statistics in base-two log-space 26 | pitch = torch.log2(pitch) 27 | 28 | # Compute utterance statistics 29 | utterance_spread = spread(pitch) 30 | 31 | # Compute word statistics 32 | word_spreads = [] 33 | for word in alignment: 34 | start = int(emphases.convert.seconds_to_frames(word.start())) 35 | end = int(emphases.convert.seconds_to_frames(word.end())) 36 | word_spreads.append(spread(pitch[0, start:end])) 37 | word_spreads = torch.tensor( 38 | word_spreads, 39 | dtype=pitch.dtype, 40 | device=pitch.device)[None] 41 | 42 | # Zero-center 43 | return word_spreads - utterance_spread 44 | 45 | 46 | ############################################################################### 47 | # Utilities 48 | ############################################################################### 49 | 50 | 51 | def spread(pitch): 52 | """Compute pitch spread""" 53 | return torch.quantile(pitch, .95) - torch.quantile(pitch, .05) 54 | 
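
The two variance baselines above share a deliberately small interface: duration_variance.infer needs only a pypar word/phoneme alignment, while pitch_variance.infer additionally takes the audio and its sample rate so that pitch can be estimated with penn. A minimal usage sketch follows; the file paths are hypothetical, and the loaders (pypar.Alignment, emphases.load.audio) are the same ones used elsewhere in this repository.

    import emphases
    import pypar

    # Hypothetical inputs: a forced alignment and the corresponding mono audio
    alignment = pypar.Alignment('example.TextGrid')
    audio = emphases.load.audio('example.wav')

    # Per-word duration-variance scores; shape (1, words)
    duration_scores = emphases.baselines.duration_variance.infer(alignment)

    # Per-word pitch-variance scores; gpu=None runs pitch estimation on the CPU
    pitch_scores = emphases.baselines.pitch_variance.infer(
        alignment,
        audio,
        emphases.SAMPLE_RATE,
        gpu=None)
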
-------------------------------------------------------------------------------- /emphases/baselines/prominence/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from . import cwt_utils 3 | from . import duration_processing 4 | from . import energy_processing 5 | from . import f0_processing 6 | from . import filter 7 | from . import loma 8 | from . import pitch_tracker 9 | from . import smooth_and_interp 10 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/core.py: -------------------------------------------------------------------------------- 1 | import fractions 2 | 3 | import torch 4 | import numpy as np 5 | from scipy.signal import resample_poly 6 | 7 | import emphases 8 | 9 | 10 | ############################################################################### 11 | # Prominence API 12 | ############################################################################### 13 | 14 | 15 | def infer(alignment, audio, sample_rate): 16 | """Compute per-word prominence from alignment and audio""" 17 | # Convert to numpy 18 | audio = audio.numpy()[0] 19 | 20 | # Compute energy 21 | energy = emphases.baselines.prominence.energy_processing.extract_energy( 22 | audio, 23 | sample_rate) 24 | energy = np.cbrt(energy + 1) 25 | 26 | # Smooth energy 27 | energy = emphases.baselines.prominence.smooth_and_interp.peak_smooth( 28 | energy, 29 | 30, 30 | 3) 31 | energy = emphases.baselines.prominence.smooth_and_interp.smooth(energy, 10) 32 | 33 | # Compute pitch 34 | pitch = emphases.baselines.prominence.pitch_tracker.inst_freq_pitch( 35 | audio, 36 | sample_rate) 37 | pitch = emphases.baselines.prominence.f0_processing.process(pitch) 38 | 39 | # Extract duration 40 | duration = \ 41 | emphases.baselines.prominence.duration_processing.get_duration_signal( 42 | alignment, 43 | weights=[.5, .5], 44 | rate=200) 45 | 46 | # Slice features 47 | min_length = np.min([len(pitch), len(energy), len(duration)]) 48 | pitch = pitch[:min_length] 49 | energy = energy[:min_length] 50 | duration = duration[:min_length] 51 | 52 | # Combine features 53 | combined = ( 54 | emphases.PROMINENCE_PITCH_WEIGHT * normalize(pitch) + 55 | emphases.PROMINENCE_ENERGY_WEIGHT * normalize(energy) + 56 | emphases.PROMINENCE_DURATION_WEIGHT * normalize(duration)) 57 | combined = normalize( 58 | emphases.baselines.prominence.smooth_and_interp.remove_bias( 59 | combined, 60 | 800)) 61 | 62 | # Distance between adjacent scales (.25 means 4 scales per octave) 63 | scale_distance = .25 # octaves 64 | 65 | # Continuous wavelet transform analysis 66 | cwt, scales, freqs = emphases.baselines.prominence.cwt_utils.cwt_analysis( 67 | combined, 68 | mother_name='mexican_hat', 69 | period=3, 70 | num_scales=34, 71 | scale_distance=scale_distance, 72 | apply_coi=False) 73 | cwt = np.real(cwt) 74 | scales *= 200 75 | 76 | # Get scale that minimizes distance with average word length 77 | average_duration = (alignment.end() / len(alignment))*200 78 | scales = 1. 
/ freqs * 200 * .5 79 | scale = np.argmin(np.abs(scales - average_duration)) 80 | 81 | # Define the scale information 82 | pos_loma_start = scale + \ 83 | int(emphases.LOMA_PROMINENCE_START / scale_distance) 84 | pos_loma_end = scale + \ 85 | int(emphases.LOMA_PROMINENCE_END / scale_distance) 86 | neg_loma_start = scale + \ 87 | int(emphases.LOMA_BOUNDARY_START / scale_distance) 88 | neg_loma_end = scale + \ 89 | int(emphases.LOMA_BOUNDARY_END / scale_distance) 90 | 91 | # Retrieve line of maximum amplitude 92 | pos_loma = emphases.baselines.prominence.loma.get_loma( 93 | cwt, 94 | scales, 95 | pos_loma_start, 96 | pos_loma_end) 97 | neg_loma = emphases.baselines.prominence.loma.get_loma( 98 | -cwt, 99 | scales, 100 | neg_loma_start, 101 | neg_loma_end) 102 | 103 | # Decode prominence 104 | max_loma = np.array(emphases.baselines.prominence.loma.get_prominences( 105 | pos_loma, 106 | alignment, 107 | rate=200)) 108 | 109 | # Prominence dimensions - [time, value] 110 | prominences = torch.tensor(max_loma) 111 | 112 | # Decode boundaries 113 | # Boundries dimensions - [time, value] 114 | boundaries = torch.tensor(emphases.baselines.prominence.loma.get_boundaries( 115 | max_loma, 116 | neg_loma, 117 | alignment)) 118 | 119 | return prominences[:, 1][None] 120 | 121 | 122 | ############################################################################### 123 | # Utilities 124 | ############################################################################### 125 | 126 | 127 | def normalize(features): 128 | """Normalize features""" 129 | return (features - np.nanmean(features)) / (np.nanstd(features) + 1e-7) 130 | 131 | 132 | def resample(signal, original_sample_rate, target_sample_rate): 133 | """Resample signal""" 134 | ratio = fractions.Fraction(target_sample_rate, original_sample_rate) 135 | return resample_poly(signal, ratio.numerator, ratio.denominator) 136 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/cwt_utils.py: -------------------------------------------------------------------------------- 1 | from numpy import array, sqrt, pad, mean, pi 2 | 3 | import pycwt as cwt 4 | 5 | 6 | ########################################################################################### 7 | # Private routines 8 | ########################################################################################### 9 | 10 | 11 | def _padded_cwt(params, dt, dj, s0, J, mother, padding_len): 12 | """Private function to compute a wavelet transform on padded data 13 | 14 | Parameters 15 | ---------- 16 | params: arraylike 17 | The prosodic parameters. 18 | dt: ? 19 | ? 20 | dj: ? 21 | ? 22 | s0: ? 23 | ? 24 | J: ? 25 | ? 26 | mother: ? 27 | The mother wavelet. 28 | padding_len: int 29 | The padding length 30 | 31 | Returns 32 | ------- 33 | wavelet_matrix: ndarray 34 | The wavelet data resulting from the analysis 35 | scales: arraylike 36 | The scale indices corresponding to the wavelet data 37 | freqs: ? 38 | ? 39 | coi: array 40 | The cone of influence values 41 | fft: ? 42 | ? 43 | fftfreqs: ? 44 | ? 
45 | """ 46 | padded = pad(params, padding_len, mode='edge') 47 | wavelet_matrix, scales, freqs, coi, fft, fftfreqs = cwt.cwt( 48 | padded, 49 | dt, 50 | dj, 51 | s0, 52 | J, 53 | mother) 54 | wavelet_matrix = \ 55 | wavelet_matrix[:, padding_len:len(wavelet_matrix[0]) - padding_len] 56 | return wavelet_matrix, scales, freqs, coi, fft, fftfreqs 57 | 58 | 59 | def _zero_outside_coi(wavelet_matrix, freqs, rate=200): 60 | """Private function to set each elements outside of the Cone Of Influence (coi) to 0. 61 | 62 | Parameters 63 | ---------- 64 | wavelet_matrix: type 65 | description 66 | freqs: type 67 | description 68 | """ 69 | for i in range(0, wavelet_matrix.shape[0]): 70 | coi = int(1. / freqs[i] * rate) 71 | wavelet_matrix[i, :coi] = 0. 72 | wavelet_matrix[i, -coi:] = 0. 73 | return wavelet_matrix 74 | 75 | 76 | def _scale_for_reconstruction( 77 | wavelet_matrix, 78 | scales, 79 | dj, 80 | dt, 81 | mother='mexican_hat', 82 | period=3): 83 | """ ? 84 | 85 | Parameters 86 | ---------- 87 | wavelet_matrix: ndarray 88 | The wavelet data resulting from the analysis 89 | scales: arraylike 90 | The scale indices corresponding to the wavelet data 91 | dj: ? 92 | ? 93 | dt: ? 94 | ? 95 | mother: ? 96 | ? 97 | period: ? 98 | ? 99 | """ 100 | scaled = array(wavelet_matrix) 101 | 102 | # mexican Hat 103 | c = dj / (3.541 * .867) 104 | 105 | if mother == 'morlet': 106 | cc = 1.83 107 | #periods 5 and 6 are correct, 3,4 approximate 108 | if period == 3: 109 | cc = 1.74 110 | if period == 4: 111 | cc = 1.1 112 | elif period == 5: 113 | cc = .9484 114 | elif period == 6: 115 | cc = .7784 116 | c = dj / (cc * pi ** (-.25)) 117 | 118 | for i in range(0, len(scales)): 119 | scaled[i] *= c * sqrt(dt) / sqrt(scales[i]) 120 | # substracting the mean should not be necessary? 121 | scaled[i] -= mean(scaled[i]) 122 | 123 | return scaled 124 | 125 | 126 | def cwt_analysis( 127 | params, 128 | mother_name='mexican_hat', 129 | num_scales=12, 130 | first_scale=None, 131 | scale_distance=1., 132 | apply_coi=True, 133 | period=5, 134 | frame_rate=200): 135 | """Achieve the continous wavelet analysis of given parameters 136 | 137 | Parameters 138 | ---------- 139 | params: arraylike 140 | The parameters to analyze. 141 | mother_name: string, optional 142 | The name of the mother wavelet [default: mexican_hat]. 143 | num_scales: int, optional 144 | The number of scales [default: 12]. 145 | first_scale: int, optional 146 | The width of the shortest scale 147 | scale_distance: float, optional 148 | The distance between scales [default: 1.0]. 149 | apply_coi: boolean, optional 150 | Apply the Cone Of Influence (coi) 151 | period: int, optional 152 | The period of the mother wavelet [default: 5]. 153 | frame_rate: int, optional 154 | The signal frame rate [default: 200]. 155 | 156 | Returns 157 | ------- 158 | wavelet_matrix: ndarray 159 | The wavelet data resulting from the analysis 160 | scales: arraylike 161 | The scale indices corresponding to the wavelet data 162 | """ 163 | # setup wavelet transform 164 | dt = 1. 
/ float(frame_rate) # frame length 165 | 166 | if not first_scale: 167 | first_scale = dt # first scale, here frame length 168 | 169 | dj = scale_distance # distance between scales in octaves 170 | J = num_scales # number of scales 171 | 172 | mother = cwt.MexicanHat() 173 | 174 | if str.lower(mother_name) == 'morlet': 175 | mother = cwt.Morlet(period) 176 | 177 | wavelet_matrix, scales, freqs, *_ = _padded_cwt( 178 | params, 179 | dt, 180 | dj, 181 | first_scale, 182 | J, 183 | mother, 184 | 400) 185 | wavelet_matrix = _scale_for_reconstruction( 186 | wavelet_matrix, 187 | scales, 188 | dj, 189 | dt, 190 | mother=mother_name, 191 | period=period) 192 | 193 | if apply_coi: 194 | wavelet_matrix = _zero_outside_coi(wavelet_matrix, freqs, frame_rate) 195 | 196 | return wavelet_matrix, scales, freqs 197 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/duration_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Constants 8 | ############################################################################### 9 | 10 | 11 | SILENCE_SYMBOLS = [ 12 | '#', 13 | '!pau', 14 | 'sp', 15 | '', 16 | 'pau', 17 | '!sil', 18 | 'sil', 19 | '', 20 | ' ', 21 | '
', 22 | '', 23 | '.', 24 | ',', 25 | '?', 26 | ''] 27 | 28 | 29 | ############################################################################### 30 | # Duration 31 | ############################################################################### 32 | 33 | 34 | def _get_dur_stats(labels, rate=200): 35 | durations = [] 36 | for i in range(len(labels)): 37 | st, en, unit = labels[i] 38 | st *= rate 39 | en *= rate 40 | if unit.lower() not in SILENCE_SYMBOLS: 41 | dur = en - st 42 | dur = np.log(dur + 1.) 43 | durations.append(dur) 44 | durations = np.array(durations) 45 | return np.min(durations), np.max(durations), np.mean(durations) 46 | 47 | 48 | def get_rate(params, hp=10, lp=150): 49 | """ 50 | estimation of speech rate as a center of gravity of wavelet spectrum 51 | similar to method described in "Boundary Detection using Continuous Wavelet Analysis" (2016) 52 | """ 53 | params = emphases.baselines.prominence.smooth_and_interp.smooth(params, hp) 54 | params -= emphases.baselines.prominence.smooth_and_interp.smooth(params, lp) 55 | 56 | wavelet_matrix, *_ = emphases.baselines.prominence.cwt_utils.cwt_analysis( 57 | params, 58 | mother_name='Morlet', 59 | num_scales=80, 60 | scale_distance=.1, 61 | apply_coi=True, 62 | period=2) 63 | wavelet_matrix = abs(wavelet_matrix) 64 | 65 | rate = np.zeros(len(params)) 66 | 67 | for i in range(0,wavelet_matrix.shape[1]): 68 | frame_en = np.sum(wavelet_matrix[:, i]) 69 | # center of gravity 70 | rate[i] = np.nonzero( 71 | wavelet_matrix[:, i].cumsum() >= frame_en * .5)[0].min() 72 | 73 | return emphases.baselines.prominence.smooth_and_interp.smooth(rate, 30) 74 | 75 | 76 | def duration(labels, rate=200): 77 | """Construct duration signal from labels""" 78 | dur = np.zeros(len(labels)) 79 | params = np.zeros(int(labels[-1][1] * rate)) 80 | prev_end = 0 81 | min_dur, *_ = _get_dur_stats(labels, rate=200) 82 | for i in range(0, len(labels)): 83 | st, en, unit = labels[i] 84 | st *= rate 85 | en *= rate 86 | dur[i] = en - st 87 | dur[i] = np.log(dur[i] + 1.) 88 | 89 | if unit.lower() in SILENCE_SYMBOLS: 90 | dur[i] = min_dur 91 | 92 | # skip very short units, likely labelling errors 93 | if en <= st + .01: 94 | continue 95 | 96 | # unit duration -> height of the duration contour in the middle of the unit 97 | index = min(len(params) - 1, int(st + (en - st) / 2.)) 98 | params[index] = dur[i] 99 | 100 | # Handle gaps in labels similarly to silences 101 | if st > prev_end and i > 1: 102 | params[int(prev_end + (st - prev_end) / 2.)] = min_dur 103 | prev_end = en 104 | 105 | # set endpoints to mean in order to avoid large "valleys" 106 | params[0] = np.mean(dur) 107 | params[-1] = np.mean(dur) 108 | 109 | # make continous duration contour and smooth a bit 110 | params = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(params, 'pchip') 111 | return emphases.baselines.prominence.smooth_and_interp.smooth(params, 20) 112 | 113 | 114 | def get_duration_signal( 115 | alignment, 116 | weights=[], 117 | rate=1): 118 | """ 119 | Construct duration contour from labels. 
If many tiers are selected, 120 | construct contours for each tier and return a weighted sum of those 121 | """ 122 | word_tier = [(word.start(), word.end(), str(word)) for word in alignment] 123 | phoneme_tier = [ 124 | (phoneme.start(), phoneme.end(), str(phoneme)) 125 | for phoneme in alignment.phonemes()] 126 | tiers = [phoneme_tier, word_tier] 127 | 128 | durations = [] 129 | 130 | for tier in tiers: 131 | durations.append( 132 | emphases.baselines.prominence.normalize( 133 | duration(tier, rate=rate))) 134 | durations = match_length(durations) 135 | sum_durations = np.zeros(len(durations[0])) 136 | if len(weights) != len(tiers): 137 | weights = np.ones(len(tiers)) 138 | for i in range(len(durations)): 139 | sum_durations += durations[i] * weights[i] 140 | return sum_durations 141 | 142 | 143 | def match_length(sig_list): 144 | """Reduce length of all signals to a the minimum one. 145 | 146 | Parameters 147 | ---------- 148 | sig_list: list 149 | List of signals which are 1D array of samples. 150 | """ 151 | length = min(map(len, sig_list)) 152 | for i in range(0, len(sig_list)): 153 | sig_list[i] = sig_list[i][:int(length)] 154 | return sig_list 155 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/energy_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import emphases 4 | 5 | 6 | def extract_energy( 7 | waveform, 8 | sample_rate=16000, 9 | min_freq=emphases.PROMINENCE_ENERGY_MIN, 10 | max_freq=emphases.PROMINENCE_ENERGY_MAX, 11 | frame_rate=200): 12 | # Get butterworth bandpass filter parameters 13 | lp_waveform = emphases.baselines.prominence.filter.butter_bandpass_filter( 14 | waveform, 15 | min_freq, 16 | max_freq, 17 | sample_rate, 18 | order=5) 19 | 20 | # Compute energy 21 | energy = np.sqrt(lp_waveform ** 2) 22 | 23 | # Resample to frame rate 24 | return emphases.baselines.prominence.resample(energy, sample_rate, frame_rate) 25 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/f0_processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import emphases 4 | 5 | 6 | def rolling_window(a, window): 7 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 8 | strides = a.strides + (a.strides[-1],) 9 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 10 | 11 | 12 | def _cut_boundary_vals(params, num_vals): 13 | cutted = np.array(params) 14 | for i in range(num_vals, len(params) - num_vals): 15 | if params[i] <= 0 and params[i + 1] > 0: 16 | for j in range(i, i + num_vals): 17 | cutted[j] = 0. 18 | 19 | if params[i] > 0 and params[i + 1] <= 0: 20 | for j in range(i - num_vals, i + 1): 21 | cutted[j] = 0. 22 | 23 | return cutted 24 | 25 | 26 | def _remove_outliers(log_pitch): 27 | fixed = np.array(log_pitch) 28 | 29 | # Remove outlier f0 values from voicing boundaries 30 | boundary_cut = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros( 31 | _cut_boundary_vals(fixed, 3), 32 | 'linear') 33 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(fixed, 'linear') 34 | fixed[abs(interp - boundary_cut) > .1] = 0 35 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(fixed, 'linear') 36 | 37 | # iterative outlier removal 38 | # 1. 
compare current contour estimate to a smoothed contour and remove deviates larger than threshold 39 | # 2. smooth current estimate with shorter window, thighten threshold 40 | # 3. goto 1. 41 | 42 | # In practice, first handles large scale octave jump type errors, 43 | # finally small scale 'errors' like consonant perturbation effects and 44 | # other irregularities in voicing boundaries 45 | # 46 | # if this appears to remove too many correct values, increase thresholds 47 | num_iter = 30 48 | max_win_len = 100 49 | min_win_len = 10 50 | max_threshold = 3. # threshold with broad window 51 | min_threshold = .5 # threshold with shorted window 52 | 53 | _std = np.std(interp) 54 | # do not tie fixing to liveliness of the original 55 | _std = .3 56 | 57 | win_len = np.exp( 58 | np.linspace(np.log(max_win_len), np.log(min_win_len), num_iter + 1)) 59 | outlier_threshold = np.linspace( 60 | _std * max_threshold, 61 | _std * min_threshold, 62 | num_iter + 1) 63 | for i in range(0, num_iter): 64 | smooth_contour = emphases.baselines.prominence.smooth_and_interp.smooth(interp, win_len[i]) 65 | low_limit = smooth_contour - outlier_threshold[i] 66 | # bit more careful upwards, not to cut emphases 67 | hi_limit = smooth_contour + outlier_threshold[i] * 1.5 68 | 69 | # octave jump down fix, more harm than good? 70 | fixed[interp > hi_limit] = 0 71 | fixed[interp < low_limit] = 0 72 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(fixed, 'linear') 73 | 74 | return fixed 75 | 76 | 77 | def _interpolate(f0): 78 | interp = emphases.baselines.prominence.smooth_and_interp.interpolate_zeros(f0) 79 | _std = np.std(interp) 80 | _min = np.min(interp) 81 | low_limit = emphases.baselines.prominence.smooth_and_interp.smooth(interp, 200) - 1.5 * _std 82 | low_limit[low_limit < _min] = _min 83 | hi_limit = emphases.baselines.prominence.smooth_and_interp.smooth(interp, 100) + 2. 
* _std 84 | voicing = np.array(f0) 85 | constrained = np.array(f0) 86 | constrained = np.maximum(f0, low_limit) 87 | constrained = np.minimum(constrained, hi_limit) 88 | interp = emphases.baselines.prominence.smooth_and_interp.peak_smooth( 89 | constrained, 90 | 100, 91 | 20, 92 | voicing=voicing) 93 | # smooth voiced parts a bit too 94 | return emphases.baselines.prominence.smooth_and_interp.peak_smooth(interp, 3, 2) 95 | 96 | 97 | def process(f0): 98 | log_pitch = np.array(f0) 99 | log_scaled = True 100 | if np.mean(f0[f0 > 0]) > 20: 101 | log_scaled = False 102 | log_pitch[f0 > 0] = np.log(f0[f0 > 0]) 103 | log_pitch[f0 <= 0] = 0 104 | 105 | log_pitch = _remove_outliers(log_pitch) 106 | log_pitch = _interpolate(log_pitch) 107 | 108 | if not log_scaled: 109 | return np.exp(log_pitch) 110 | else: 111 | return log_pitch 112 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/filter.py: -------------------------------------------------------------------------------- 1 | from scipy.signal import butter, lfilter 2 | 3 | 4 | def butter_bandpass(lowcut, highcut, fs, order=5): 5 | """Generate the butter bandpass filter 6 | 7 | For more details see scipy.signal.butter documentation 8 | 9 | Parameters 10 | ---------- 11 | lowcut: int 12 | The low cut value 13 | highcut: type 14 | description 15 | fs: int 16 | Signal sample rate 17 | order: int 18 | Order of the butter fiter 19 | 20 | Returns 21 | ------- 22 | b: arraylike 23 | Numerator polynomial of the IIR filter 24 | a: arraylike 25 | Denominator polynomial of the IIR filter 26 | """ 27 | nyq = .5 * fs 28 | low = lowcut / nyq 29 | if highcut >= nyq * .95: 30 | highcut = nyq * .95 31 | high = highcut / nyq 32 | b, a = butter(order, [low, high], btype='band') 33 | return b, a 34 | 35 | 36 | def butter_bandpass_filter(data, lowcut, highcut, fs, order=5): 37 | """Filter signal data using a butter filter type 38 | 39 | For more details see scipy.signal.butter and scipy.signal.lfilter documentation 40 | 41 | Parameters 42 | ---------- 43 | data: arraylike 44 | An N-dimensional input array. 45 | lowcut: int 46 | The lowcut filtering value. 47 | highcut: type 48 | The highcut filtering value. 49 | fs: int 50 | The signal sample rate. 51 | order: int 52 | The order of the butter filter. 53 | 54 | Returns 55 | ------- 56 | arraylike 57 | An N-dimensional filtered array 58 | """ 59 | b, a = butter_bandpass(lowcut, highcut, fs, order=order) 60 | return lfilter(b, a, data) 61 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/loma.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from operator import itemgetter 3 | 4 | 5 | def simplify(loma): 6 | """? 7 | Parameters 8 | ---------- 9 | loma: type 10 | description 11 | """ 12 | simplified = [] 13 | for l in loma: 14 | # align loma to it's position in the middle of the line 15 | pos = l[int(len(l) / 2.)][0] 16 | strength = l[-1][1] 17 | simplified.append((pos, strength)) 18 | return simplified 19 | 20 | 21 | def get_prominences(pos_loma, alignment, rate=1): 22 | """? 23 | Parameters 24 | ---------- 25 | pos_loma: list of ? 
26 | Positive loma values 27 | labels: list of tuple (float, float, string) 28 | List of labels which are lists of 3 elements [start, end, description] 29 | """ 30 | max_word_loma = [] 31 | loma = simplify(pos_loma) 32 | for st, end in [(word.start(), word.end()) for word in alignment]: 33 | st *= rate 34 | end *= rate 35 | word_loma = [] 36 | for l in loma: 37 | if l[0] >= st and l[0] <= end: 38 | word_loma.append(l) 39 | if len(word_loma) > 0: 40 | max_word_loma.append(sorted(word_loma, key=itemgetter(1))[-1]) 41 | else: 42 | max_word_loma.append([st + (end - st) / 2., 0.]) 43 | return max_word_loma 44 | 45 | 46 | def get_boundaries(max_word_loma, boundary_loma, alignment): 47 | """get strongest lines of minimum amplitude between adjacent words' max lines""" 48 | boundary_loma = simplify(boundary_loma) 49 | max_boundary_loma = [] 50 | st = 0 51 | end = 0 52 | for i in range(1, len(max_word_loma)): 53 | w_boundary_loma = [] 54 | for l in boundary_loma: 55 | st = max_word_loma[i - 1][0] 56 | end = max_word_loma[i][0] 57 | if l[0] >= st and l[0] < end: 58 | if l[1] > 0: 59 | w_boundary_loma.append(l) 60 | 61 | if len(w_boundary_loma) > 0: 62 | max_boundary_loma.append( 63 | sorted(w_boundary_loma, key=itemgetter(1))[-1]) 64 | else: 65 | max_boundary_loma.append([st + (end - st) / 2, 0]) 66 | 67 | # final boundary is not estimated 68 | max_boundary_loma.append((alignment.end(), 1)) 69 | 70 | return max_boundary_loma 71 | 72 | 73 | def _get_parent(child_index, parent_diff, parent_indices): 74 | """Private function to find the parent of the given child peak. At child peak index, follow the 75 | slope of parent scale upwards to find parent 76 | 77 | Parameters 78 | ---------- 79 | child_index: int 80 | Index of the current child peak 81 | parent_diff: list of ? 82 | ? 83 | parent_indices: list of int ? 84 | Indices of available parents 85 | 86 | Returns 87 | _______ 88 | int 89 | The parent index or None if there is no parent 90 | """ 91 | for i in range(0, len(parent_indices)): 92 | if parent_indices[i] > child_index: 93 | if parent_diff[int(child_index)] > 0: 94 | return parent_indices[i] 95 | else: 96 | if i > 0: 97 | return parent_indices[i - 1] 98 | else: 99 | return parent_indices[0] 100 | 101 | if len(parent_indices) > 0: 102 | return parent_indices[-1] 103 | 104 | 105 | def get_loma(wavelet_matrix, scales, min_scale, max_scale): 106 | """Get the Line Of Maximum Amplitude (loma) 107 | 108 | Parameters 109 | ---------- 110 | wavelet_matrix: matrix of float 111 | The wavelet matrix 112 | scales: list of int 113 | The list of scales 114 | min_scale: int 115 | The minimum scale 116 | max_scale: int 117 | The maximum scale 118 | 119 | Returns 120 | ------- 121 | list of tuples 122 | ? 123 | 124 | Note 125 | ---- 126 | change this so that one level is done in one chunk, not one parent. 127 | """ 128 | min_peak = -10000. # minimum peak amplitude to consider. NOTE:this has no meaning unless scales normalized 129 | max_dist = 10 # how far in time to look for parent peaks. 
NOTE: frame rate and scale dependent 130 | 131 | # get peaks from the first scale 132 | peaks, indices = get_peaks(wavelet_matrix[min_scale], min_peak) 133 | 134 | loma = dict() 135 | root = dict() 136 | for i in range(0, len(peaks)): 137 | loma[indices[i]] = [] 138 | 139 | # keep track of roots of each loma 140 | root[indices[i]] = indices[i] 141 | 142 | for i in range(min_scale + 1, max_scale): 143 | max_dist = np.sqrt(scales[i]) * 4 144 | 145 | # find peaks in the parent scale 146 | p_peaks, p_indices = get_peaks(wavelet_matrix[i], min_peak) 147 | parents = dict(zip(p_indices, p_peaks)) 148 | 149 | # find a parent for each child peak 150 | children = dict() 151 | for p in p_indices: 152 | children[p] = [] 153 | 154 | parent_diff = np.diff(wavelet_matrix[i], 1) 155 | for j in range(0, len(indices)): 156 | parent =_get_parent(indices[j], parent_diff, p_indices) 157 | if parent: 158 | if abs(parent - indices[j]) < max_dist and peaks[j] > min_peak: 159 | children[parent].append([indices[j], peaks[j]]) 160 | 161 | # for each parent, select max child 162 | peaks = [] 163 | indices = [] 164 | for p in children: 165 | if len(children[p]) > 0: 166 | maxi = sorted(children[p], key=itemgetter(1))[-1] 167 | indices.append(p) 168 | peaks.append(maxi[1] + parents[p]) 169 | 170 | #append child to correct loma 171 | loma[root[maxi[0]]].append([maxi[0], maxi[1] + parents[p], i, p]) 172 | root[p] = root[maxi[0]] 173 | 174 | sorted_loma = [] 175 | for k in sorted(loma.keys()): 176 | if len(loma[k]) > 0: 177 | sorted_loma.append(loma[k]) 178 | 179 | return sorted_loma 180 | 181 | 182 | def get_peaks(params, threshold=-10): 183 | """Find the peaks based on the given prosodic parameters. 184 | 185 | Parameters 186 | ---------- 187 | params: ? 188 | Prosodic parameters 189 | threshold: int 190 | description 191 | 192 | Returns 193 | ------- 194 | peaks: arraylike 195 | array of peak values and peak indices 196 | """ 197 | indices = (np.diff(np.sign(np.diff(params))) < 0).nonzero()[0] + 1 198 | peaks = params[indices] 199 | return np.array([peaks[peaks > threshold], indices[peaks > threshold]]) 200 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/pitch_tracker.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import scipy.signal 4 | 5 | import emphases 6 | 7 | 8 | def _get_f0(spec, energy, min_hz, max_hz, thresh, sil_thresh): 9 | """ 10 | return frequency bin with maximum energy, if it is over given threshold 11 | and overall energy of the frame is over silence threshsold 12 | otherwise return 0 (unvoiced) 13 | """ 14 | cand = int(min_hz) + np.argmax(spec[int(min_hz):int(max_hz)]) 15 | if spec[cand] > thresh and energy > sil_thresh: 16 | if cand > 2 * min_hz and spec[int(round(cand / 2.))] > spec[cand] * .5: 17 | return int(round(cand / 2.)) 18 | else: 19 | return cand 20 | return 0 21 | 22 | 23 | def _track_pitch( 24 | pic, 25 | min_hz=50, 26 | max_hz=450, 27 | thresh=.1, 28 | energy_thresh=1.): 29 | """ 30 | extract pitch contour from time-frequency image 31 | bin with maximum energy / frame is chosen as a first f0 estimate, 32 | following with refinement steps based on the assumption of continuity of the pitch track 33 | """ 34 | pitch = np.zeros(pic.shape[0]) 35 | 36 | # calc energy threshold for voicing 37 | log_energy = np.log(np.sum(pic, axis=1)) 38 | energy_thresh = \ 39 | np.min(emphases.baselines.prominence.smooth_and_interp.smooth(log_energy, 20)) + energy_thresh 40 | 
pic_smooth = pic * scipy.ndimage.gaussian_filter(pic, [2, 5]) 41 | 42 | # find frequency bins with max_energy 43 | for i in range(0, pic_smooth.shape[0]): 44 | pitch[i] = _get_f0( 45 | pic_smooth[i], 46 | log_energy[i], 47 | min_hz, 48 | max_hz, 49 | thresh, 50 | energy_thresh) 51 | 52 | # second pass with soft constraints 53 | n_iters = 3 54 | from scipy.signal import gaussian 55 | 56 | for iter in range(0, n_iters): 57 | smoothed = emphases.baselines.prominence.f0_processing.process(pitch) 58 | smoothed = emphases.baselines.prominence.smooth_and_interp.smooth(smoothed, int(200. / (iter + 1.))) 59 | 60 | # gradually thightening gaussian window centered on current estimate to softly constrain next iteration 61 | win_len = 800 62 | g_window = gaussian(win_len, int(np.mean(smoothed) * (1. / (iter + 1.) ** 2))) 63 | 64 | for i in range(0, pic.shape[0]): 65 | window = np.zeros(len(pic_smooth[i])) 66 | st = int(np.max((0, int(smoothed[i] - win_len)))) 67 | end = int(np.min((int(smoothed[i] + win_len * .5), win_len - st))) 68 | window[st:end] = g_window[win_len - end:] 69 | pitch[i] = _get_f0( 70 | pic_smooth[i] * window, log_energy[i], 71 | min_hz, 72 | max_hz, 73 | thresh, 74 | energy_thresh) 75 | 76 | return pitch 77 | 78 | 79 | def _assign_to_bins(pic, freqs, mags): 80 | for i in range(1, freqs.shape[0] - 1): 81 | for j in range(0, freqs.shape[1]): 82 | try: 83 | pic[j, int(freqs[i, j])] += mags[i, j] 84 | except: 85 | pass 86 | 87 | 88 | def inst_freq_pitch( 89 | wav_form, 90 | fs, 91 | min_hz=emphases.FMIN, 92 | max_hz=emphases.FMAX, 93 | voicing_thresh=emphases.VOICED_THRESHOLD, 94 | target_rate=200): 95 | """Extract speech f0 using the continuous wavelet transform""" 96 | voicing_thresh = (voicing_thresh - 50.) / 100. 97 | sample_rate = 4000 98 | tmp_wav_form = emphases.baselines.prominence.resample(wav_form, fs, sample_rate) 99 | tmp_wav_form = emphases.baselines.prominence.normalize(tmp_wav_form) 100 | 101 | DEC = int(round(sample_rate / target_rate)) 102 | 103 | pic = np.zeros( 104 | shape=(int(len(tmp_wav_form) / float(DEC)), int(sample_rate / 4.))) 105 | 106 | # use continuous wavelet transform to get instantenous frequencies 107 | # integrate analyses with morlet mother wavelets with period = 5 for 108 | # good time and frequency resolution 109 | # setup wavelet 110 | s0 = 2. / sample_rate 111 | dj = .05 # 20 scales per octave 112 | J = 120 # six octaves 113 | dt = 1. / sample_rate 114 | periods = [5] 115 | for p in periods: 116 | wavelet_matrix, *_ = emphases.baselines.prominence.cwt_utils.cwt_analysis( 117 | tmp_wav_form, 118 | mother_name='morlet', 119 | first_scale=s0, 120 | num_scales=J, 121 | scale_distance=dj, 122 | apply_coi=False, 123 | period=p, 124 | frame_rate=sample_rate) 125 | 126 | # hilbert transform 127 | phase = np.unwrap(np.angle(wavelet_matrix), axis=1) 128 | freqs = np.abs((np.gradient(phase, dt)[1]) / (2. 
* np.pi)) 129 | 130 | freqs = scipy.signal.decimate(freqs, DEC, zero_phase=True) 131 | mags = scipy.signal.decimate(abs(wavelet_matrix), DEC, zero_phase=True) 132 | 133 | # normalize magnitudes 134 | mags = (mags - mags.min()) / mags.ptp() 135 | 136 | # construct time-frequency image 137 | _assign_to_bins(pic, freqs, mags) 138 | 139 | # perform frequency domain autocorrelation to enhance f0 140 | pic = scipy.ndimage.filters.gaussian_filter(pic, [1, 1]) 141 | length = np.min((max_hz * 3, pic.shape[1])).astype(int) 142 | 143 | for i in range(0, pic.shape[0]): 144 | acorr1 = np.correlate(pic[i, :length], pic[i, :length], mode='same') 145 | pic[i, :int(length / 2.)] *= acorr1[int(len(acorr1) / 2.):] 146 | 147 | return _track_pitch(pic, min_hz, max_hz, voicing_thresh) 148 | -------------------------------------------------------------------------------- /emphases/baselines/prominence/smooth_and_interp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy import interpolate 3 | 4 | 5 | def remove_bias(params, win_len=300): 6 | return params - smooth(params, win_len) 7 | 8 | 9 | def interpolate_zeros(params, method='pchip', min_val=0): 10 | """ 11 | Interpolate 0 values 12 | :param params: 1D data vector 13 | :param method: 14 | :param factor: factor for interpolation (must be integer) 15 | :return: interpolated 1D vector by a given factor 16 | """ 17 | voiced = np.array(params, float) 18 | for i in range(0, len(voiced)): 19 | if voiced[i] == min_val: 20 | voiced[i] = np.nan 21 | 22 | if np.isnan(voiced[-1]): 23 | voiced[-1] = np.nanmin(voiced) 24 | if np.isnan(voiced[0]): 25 | voiced[0] = np.nanmean(voiced) 26 | 27 | not_nan = np.logical_not(np.isnan(voiced)) 28 | 29 | indices = np.arange(len(voiced)) 30 | if method == 'spline': 31 | interp = interpolate.UnivariateSpline( 32 | indices[not_nan], 33 | voiced[not_nan], 34 | k=2, 35 | s=0) 36 | # return voiced parts intact 37 | smoothed = interp(indices) 38 | for i in range(0, len(smoothed)): 39 | if not np.isnan(voiced[i]): 40 | smoothed[i] = params[i] 41 | return smoothed 42 | 43 | elif method == 'pchip': 44 | interp = interpolate.pchip(indices[not_nan], voiced[not_nan]) 45 | else: 46 | interp = interpolate.interp1d( 47 | indices[not_nan], 48 | voiced[not_nan], 49 | method) 50 | return interp(indices) 51 | 52 | 53 | def smooth(params, win, type='HAMMING'): 54 | """gaussian type smoothing, convolution with hamming window""" 55 | win = int(win + .5) 56 | if win >= len(params) - 1: 57 | win = len(params) - 1 58 | 59 | if win % 2 == 0: 60 | win += 1 61 | 62 | s = np.r_[params[win - 1:0:-1], params, params[-1:-win:-1]] 63 | 64 | if type == 'HAMMING': 65 | w = np.hamming(win) 66 | else: 67 | w = np.ones(win) 68 | 69 | y = np.convolve(w / w.sum(), s, mode='valid') 70 | return y[int(win / 2):-int(win / 2)] 71 | 72 | 73 | def peak_smooth(params, max_iter, win, min_win=2, voicing=[]): 74 | """Iterative smoothing while preserving peaks, 'true envelope' -style""" 75 | smoothed = np.array(params) 76 | win_reduce = np.exp(np.linspace(np.log(win), np.log(min_win), max_iter)) 77 | 78 | for i in range(0, max_iter): 79 | 80 | smoothed = np.maximum(params, smoothed) 81 | 82 | if len(voicing) > 0: 83 | smoothed = smooth(smoothed, int(win + .5)) 84 | smoothed[voicing > 0] = params[voicing > 0] 85 | else: 86 | smoothed = smooth(smoothed, int(win + .5), type='rectangle') 87 | 88 | win = win_reduce[i] 89 | 90 | return smoothed 91 | 
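
The prominence baseline ties the preceding modules together: energy_processing and pitch_tracker produce frame-rate contours from the audio, duration_processing builds a duration contour from the alignment, cwt_utils applies the continuous wavelet transform to their weighted sum, and loma decodes one prominence value per word from the lines of maximum amplitude. A minimal sketch of the top-level call (the paths are hypothetical; the entry point is the infer function in prominence/core.py above):

    import emphases
    import pypar

    # Hypothetical inputs: a word-level alignment and mono audio at 16 kHz
    alignment = pypar.Alignment('example.TextGrid')
    audio = emphases.load.audio('example.wav')

    # Per-word prominence scores; shape (1, words)
    scores = emphases.baselines.prominence.infer(
        alignment,
        audio,
        emphases.SAMPLE_RATE)
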
-------------------------------------------------------------------------------- /emphases/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/emphases/config/__init__.py -------------------------------------------------------------------------------- /emphases/config/defaults.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import torch 5 | import GPUtil 6 | 7 | 8 | ############################################################################### 9 | # Metadata 10 | ############################################################################### 11 | 12 | 13 | # Configuration name 14 | CONFIG = 'emphases' 15 | 16 | 17 | ############################################################################### 18 | # Directories 19 | ############################################################################### 20 | 21 | 22 | # Location to save assets to be bundled with pip release 23 | ASSETS_DIR = Path(__file__).parent.parent / 'assets' 24 | 25 | # Location of preprocessed features 26 | CACHE_DIR = Path(__file__).parent.parent.parent / 'data' / 'cache' 27 | 28 | # Location of datasets on disk 29 | DATA_DIR = Path(__file__).parent.parent.parent / 'data' / 'datasets' 30 | 31 | # Location to save evaluation artifacts 32 | EVAL_DIR = Path(__file__).parent.parent.parent / 'eval' 33 | 34 | # Location to save training and adaptation artifacts 35 | RUNS_DIR = Path(__file__).parent.parent.parent / 'runs' 36 | 37 | # Location of compressed datasets on disk 38 | SOURCE_DIR = Path(__file__).parent.parent.parent / 'data' / 'sources' 39 | 40 | 41 | ############################################################################### 42 | # Audio parameters 43 | ############################################################################### 44 | 45 | 46 | # The maximum representable frequency 47 | FMAX = 550. 48 | 49 | # The minumum representable frequency 50 | FMIN = 40. 51 | 52 | # The number of samples between frames 53 | HOPSIZE = 160 54 | 55 | # Minimum decibel level 56 | MIN_DB = -100. 57 | 58 | # Number of linear frequency channels 59 | NUM_FFT = 1024 60 | 61 | # Number of mel channels 62 | NUM_MELS = 80 63 | 64 | # Voiced/unvoiced threshold for pitch estimation 65 | VOICED_THRESHOLD = .1625 66 | 67 | # Reference decibel level 68 | REF_DB = 20. 
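# Note: with HOPSIZE = 160 samples (above) and SAMPLE_RATE = 16000 Hz (below),
# each frame covers 160 / 16000 = 10 ms, i.e. a 100 Hz frame rate;
# HOPSIZE_SECONDS in emphases/config/static.py is derived from this ratio.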
69 | 70 | # The audio samling rate 71 | SAMPLE_RATE = 16000 72 | 73 | # The size of the audio analysis window 74 | WINDOW_SIZE = 1024 75 | 76 | 77 | ############################################################################### 78 | # Data parameters 79 | ############################################################################### 80 | 81 | 82 | # List of all datasets 83 | DATASETS = ['libritts'] 84 | 85 | # Datasets to use for evaluation 86 | EVALUATION_DATASETS = ['libritts'] 87 | 88 | # Whether to use mel features 89 | MEL_FEATURE = True 90 | 91 | # Whether to use loudness features 92 | LOUDNESS_FEATURE = False 93 | 94 | # Maximum number of allowed annotations 95 | MAX_ANNOTATIONS = None 96 | 97 | # Maximum number of training utterances 98 | MAX_TRAINING_UTTERANCES = None 99 | 100 | # Minimum number of allowed annotations 101 | MIN_ANNOTATIONS = None 102 | 103 | # Normalize input representations 104 | NORMALIZE = False 105 | 106 | # Whether to use the specified one-eighth dataset for scaling law experiments 107 | ONE_EIGHTH_UTTERANCES = False 108 | 109 | # Whether to use pitch features 110 | PITCH_FEATURE = False 111 | 112 | # Whether to use periodicity features 113 | PERIODICITY_FEATURE = False 114 | 115 | # Seed for all random number generators 116 | RANDOM_SEED = 0 117 | 118 | # Size of each partition. Must add to 1. 119 | SPLIT_SIZE_TEST = .1 120 | SPLIT_SIZE_TRAIN = .8 121 | SPLIT_SIZE_VALID = .1 122 | 123 | # Dataset to use for training 124 | TRAINING_DATASET = 'libritts' 125 | 126 | # Dataset to use for validation 127 | VALIDATION_DATASET = 'libritts' 128 | 129 | 130 | ############################################################################### 131 | # Evaluation parameters 132 | ############################################################################### 133 | 134 | 135 | # Number of steps between logging to Tensorboard 136 | LOG_INTERVAL = 100 # steps 137 | 138 | # Number of steps to perform for tensorboard logging 139 | LOG_STEPS = 32 140 | 141 | # Number of examples to plot to Tensorboard during training 142 | PLOT_EXAMPLES = 2 143 | 144 | 145 | ############################################################################### 146 | # Wavelet baseline parameters 147 | ############################################################################### 148 | 149 | 150 | # Line of maximum amplitude bounds 151 | LOMA_BOUNDARY_START = -2 # octaves 152 | LOMA_BOUNDARY_END = 1 # octaves 153 | LOMA_PROMINENCE_START = -3 # octaves 154 | LOMA_PROMINENCE_END = 0 # octaves 155 | 156 | # Weight applied to the duration 157 | PROMINENCE_DURATION_WEIGHT = .5 158 | 159 | # Maximum frequency in energy calculation 160 | PROMINENCE_ENERGY_MAX = 5000. 161 | 162 | # Minimum frequency in energy calculation 163 | PROMINENCE_ENERGY_MIN = 200. 164 | 165 | # Weight applied to the energy 166 | PROMINENCE_ENERGY_WEIGHT = 1. 167 | 168 | # Weight applied to the pitch 169 | PROMINENCE_PITCH_WEIGHT = 1. 170 | 171 | # Voiced/unvoiced threshold from 0 (all voiced) to 100 (all unvoiced) 172 | VOICED_THRESHOLD = 50 173 | 174 | 175 | ############################################################################### 176 | # Model parameters 177 | ############################################################################### 178 | 179 | 180 | # Activation function to use in convolution model 181 | ACTIVATION_FUNCTION = torch.nn.ReLU 182 | 183 | # Model architecture. One of ['convolution', 'transformer']. 
184 | ARCHITECTURE = 'convolution' 185 | 186 | # Model width 187 | CHANNELS = 80 188 | 189 | # Decoder convolution kernel size 190 | DECODER_KERNEL_SIZE = 3 191 | 192 | # Dropout probability (or None to not use dropout) 193 | DROPOUT = None 194 | 195 | # Location to perform resampling from frame resolution to word resolution. 196 | # One of ['inference', 'input', 'intermediate', 'loss']. 197 | DOWNSAMPLE_LOCATION = 'intermediate' 198 | 199 | # Method to use for resampling from frame resolution to word resolution. 200 | # One of ['average', 'center', 'max', 'sum']. 201 | DOWNSAMPLE_METHOD = 'sum' 202 | 203 | # Encoder convolution kernel size 204 | ENCODER_KERNEL_SIZE = 3 205 | 206 | # Number of network layers 207 | LAYERS = 6 208 | 209 | # Method to use for inference. One of 210 | # ['neural', 'pitch-variance', 'duration-variance', 'prominence]. 211 | METHOD = 'neural' 212 | 213 | # Method to use for resampling from word resolution to frame resolution. 214 | # One of ['linear', 'nearest']. 215 | UPSAMPLE_METHOD = 'linear' 216 | 217 | 218 | ############################################################################### 219 | # Training parameters 220 | ############################################################################### 221 | 222 | 223 | # Number of buckets of data lengths used by the sampler 224 | BUCKETS = 2 225 | 226 | # Loss function. One of ['bce', 'mse'] 227 | LOSS = 'bce' 228 | 229 | # Maximum number of frames in one batch 230 | MAX_TRAINING_FRAMES = 75000 231 | 232 | # Number of training steps 233 | NUM_STEPS = 6000 234 | 235 | # Number of data loading worker threads 236 | try: 237 | NUM_WORKERS = int(os.cpu_count() / max(1, len(GPUtil.getGPUs()))) 238 | except ValueError: 239 | NUM_WORKERS = os.cpu_count() 240 | -------------------------------------------------------------------------------- /emphases/config/static.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Files and directories 8 | ############################################################################### 9 | 10 | 11 | # Directory to save annotation artifacts 12 | ANNOTATION_DIR = emphases.SOURCE_DIR / 'crowdsource' 13 | 14 | # Default configuration file for emphasis annotation 15 | DEFAULT_ANNOTATION_CONFIG = emphases.ASSETS_DIR / 'configs' / 'annotate.yaml' 16 | 17 | # Location to save dataset partitions 18 | PARTITION_DIR = emphases.ASSETS_DIR / 'partitions' 19 | 20 | 21 | ############################################################################### 22 | # Audio parameters 23 | ############################################################################### 24 | 25 | 26 | # The hopsize in seconds 27 | HOPSIZE_SECONDS = emphases.HOPSIZE / emphases.SAMPLE_RATE 28 | 29 | # The maximum representable frequency in log-hz 30 | LOGFMAX = torch.log2(torch.tensor(emphases.FMAX)) 31 | 32 | # The minumum representable frequency in log-hz 33 | LOGFMIN = torch.log2(torch.tensor(emphases.FMIN)) 34 | 35 | 36 | ############################################################################### 37 | # Model parameters 38 | ############################################################################### 39 | 40 | 41 | # Number of input features to the model 42 | NUM_FEATURES = ( 43 | emphases.MEL_FEATURE * emphases.NUM_MELS + 44 | int(emphases.PITCH_FEATURE) + 45 | int(emphases.PERIODICITY_FEATURE) + 46 | int(emphases.LOUDNESS_FEATURE)) 47 | 
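
With the default feature flags in config/defaults.py (MEL_FEATURE = True; pitch, periodicity, and loudness disabled), NUM_FEATURES above reduces to the number of mel channels. A small sanity check, assuming the package is imported under that default configuration:

    import emphases

    # Mels are the only enabled input feature under the defaults,
    # so the model consumes NUM_MELS = 80 channels per frame
    assert emphases.NUM_FEATURES == emphases.NUM_MELS == 80
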
-------------------------------------------------------------------------------- /emphases/convert.py: -------------------------------------------------------------------------------- 1 | import emphases 2 | 3 | 4 | ############################################################################### 5 | # Time conversions 6 | ############################################################################### 7 | 8 | 9 | def frames_to_samples(frames): 10 | """Convert number of frames to samples""" 11 | return frames * emphases.HOPSIZE 12 | 13 | 14 | def frames_to_seconds(frames): 15 | """Convert number of frames to seconds""" 16 | return frames * emphases.HOPSIZE_SECONDS 17 | 18 | 19 | def seconds_to_frames(seconds): 20 | """Convert seconds to number of frames""" 21 | return samples_to_frames(seconds_to_samples(seconds)) 22 | 23 | 24 | def seconds_to_samples(seconds): 25 | """Convert seconds to number of samples""" 26 | return seconds * emphases.SAMPLE_RATE 27 | 28 | 29 | def samples_to_frames(samples): 30 | """Convert samples to number of frames""" 31 | return samples // emphases.HOPSIZE 32 | 33 | 34 | def samples_to_seconds(samples): 35 | """Convert number of samples to seconds""" 36 | return samples / emphases.SAMPLE_RATE 37 | -------------------------------------------------------------------------------- /emphases/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . import download 2 | from . import preprocess 3 | from .collate import collate 4 | from .dataset import Dataset 5 | from .loader import loader 6 | from .sampler import sampler 7 | -------------------------------------------------------------------------------- /emphases/data/collate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Batch collation 8 | ############################################################################### 9 | 10 | 11 | def collate(batch): 12 | """Batch collation""" 13 | # Unpack 14 | features, scores, word_bounds, alignments, audios, stems = zip(*batch) 15 | 16 | # Get word lengths 17 | word_lengths = torch.tensor( 18 | [bounds.shape[-1] for bounds in word_bounds], 19 | dtype=torch.long) 20 | max_word_length = word_lengths.max().item() 21 | 22 | # Get frame lengths 23 | frame_lengths = torch.tensor( 24 | [feat.shape[-1] for feat in features], 25 | dtype=torch.long) 26 | max_frame_length = frame_lengths.max().item() 27 | 28 | # Network output lengths 29 | output_lengths = word_lengths 30 | max_output_length = max_word_length 31 | 32 | # Allocate padded tensors 33 | padded_features = torch.zeros( 34 | (len(features), emphases.NUM_FEATURES, max_frame_length)) 35 | padded_scores = torch.zeros((len(scores), 1, max_output_length)) 36 | padded_bounds = torch.zeros( 37 | (len(word_bounds), 2, max_word_length), 38 | dtype=torch.long) 39 | padded_audio = torch.zeros( 40 | (len(audios), 1, max_frame_length * emphases.HOPSIZE)) 41 | 42 | # Place batch in padded tensors 43 | for ( 44 | i, 45 | (bounds, audio, feat, score, frame_length, word_length, output_length) 46 | ) in enumerate( 47 | zip( 48 | word_bounds, 49 | audios, 50 | features, 51 | scores, 52 | frame_lengths, 53 | word_lengths, 54 | output_lengths) 55 | ): 56 | 57 | # Pad features 58 | padded_features[i, :, :frame_length] = feat 59 | 60 | # Pad scores 61 | padded_scores[i, :, :output_length] = score[:, :output_length] 62 | 63 | # Pad 
word bounds 64 | padded_bounds[i, :, :word_length] = bounds[:, :word_length] 65 | 66 | # Pad audio 67 | end_sample = frame_length * emphases.HOPSIZE 68 | padded_audio[i, :, :end_sample] = audio[:, :end_sample] 69 | 70 | return ( 71 | padded_features, 72 | frame_lengths, 73 | padded_bounds, 74 | word_lengths, 75 | padded_scores, 76 | alignments, 77 | padded_audio, 78 | stems) 79 | -------------------------------------------------------------------------------- /emphases/data/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy as np 4 | import pypar 5 | import torch 6 | import torchaudio 7 | 8 | import emphases 9 | 10 | 11 | ############################################################################### 12 | # Dataset 13 | ############################################################################### 14 | 15 | 16 | class Dataset(torch.utils.data.Dataset): 17 | 18 | def __init__(self, name, partition): 19 | self.cache = emphases.CACHE_DIR / name 20 | 21 | # Get list of stems 22 | with open(emphases.PARTITION_DIR / f'{name}.json') as file: 23 | self.stems = json.load(file)[partition] 24 | 25 | # Store lengths for bucketing 26 | audio_files = [ 27 | self.cache / 'audio' / f'{stem}.wav' for stem in self.stems] 28 | self.lengths = [ 29 | emphases.convert.samples_to_frames( 30 | torchaudio.info(audio_file).num_frames) 31 | for audio_file in audio_files] 32 | 33 | # Total number of frames 34 | self.frames = sum(self.lengths) 35 | 36 | def __getitem__(self, index): 37 | """Retrieve the indexth item""" 38 | stem = self.stems[index] 39 | 40 | # Load alignment 41 | alignment = pypar.Alignment( 42 | self.cache / 'alignment' / f'{stem}.TextGrid') 43 | 44 | # Compute word bounds 45 | bounds = alignment.word_bounds( 46 | emphases.SAMPLE_RATE, 47 | emphases.HOPSIZE, 48 | silences=True) 49 | word_bounds = torch.cat( 50 | [torch.tensor(bound)[None] for bound in bounds]).T 51 | 52 | # Load audio 53 | audio = emphases.load.audio(self.cache / 'audio' / f'{stem}.wav') 54 | 55 | features = [] 56 | 57 | # Load mels 58 | if emphases.MEL_FEATURE: 59 | features.append(torch.load(self.cache / 'mels' / f'{stem}.pt')) 60 | 61 | # Load pitch 62 | if emphases.PITCH_FEATURE: 63 | pitch = torch.load(self.cache / 'pitch' / f'{stem}-pitch.pt') 64 | if emphases.NORMALIZE: 65 | features.append( 66 | (torch.log2(pitch) - emphases.LOGFMIN) / 67 | (emphases.LOGFMAX - emphases.LOGFMIN)) 68 | else: 69 | features.append(torch.log2(pitch)) 70 | 71 | # Load periodicity 72 | if emphases.PERIODICITY_FEATURE: 73 | periodicity = torch.load( 74 | self.cache / 'pitch' / f'{stem}-periodicity.pt') 75 | features.append(periodicity) 76 | 77 | # Load loudness 78 | if emphases.LOUDNESS_FEATURE: 79 | loudness = torch.load(self.cache / 'loudness' / f'{stem}.pt') 80 | features.append(loudness) 81 | 82 | # Concatenate 83 | features = features[0] if len(features) == 1 else torch.cat(features) 84 | 85 | # Load per-word ground truth emphasis scores 86 | scores = torch.load(self.cache / 'scores' / f'{stem}.pt')[None] 87 | 88 | return features, scores, word_bounds, alignment, audio, stem 89 | 90 | def __len__(self): 91 | """Length of the dataset""" 92 | return len(self.stems) 93 | 94 | def buckets(self): 95 | """Partition indices into buckets based on length for sampling""" 96 | # Get the size of a bucket 97 | size = len(self) // emphases.BUCKETS 98 | 99 | # Get indices in order of length 100 | indices = np.argsort(self.lengths) 101 | lengths = np.sort(self.lengths) 102 | 103 | # 
Split into buckets based on length 104 | buckets = [ 105 | np.stack((indices[i:i + size], lengths[i:i + size])).T 106 | for i in range(0, len(self), size)] 107 | 108 | # Concatenate partial bucket 109 | if len(buckets) == emphases.BUCKETS + 1: 110 | residual = buckets.pop() 111 | buckets[-1] = np.concatenate((buckets[-1], residual), axis=0) 112 | 113 | return buckets 114 | -------------------------------------------------------------------------------- /emphases/data/download/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/data/download/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Download datasets 8 | ############################################################################### 9 | 10 | 11 | def parse_args(): 12 | """Parse command-line arguments""" 13 | parser = argparse.ArgumentParser(description='Download datasets') 14 | parser.add_argument( 15 | '--datasets', 16 | nargs='+', 17 | default=emphases.DATASETS, 18 | help='The datasets to download') 19 | parser.add_argument( 20 | '--gpu', 21 | type=int, 22 | help='The index of the gpu to run inference on') 23 | return parser.parse_known_args()[0] 24 | 25 | 26 | emphases.data.download.datasets(**vars(parse_args())) 27 | -------------------------------------------------------------------------------- /emphases/data/download/core.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import shutil 4 | import ssl 5 | import tarfile 6 | import urllib 7 | import yaml 8 | 9 | import pyfoal 10 | import pypar 11 | import torch 12 | import torchutil 13 | import torchaudio 14 | 15 | import emphases 16 | 17 | 18 | ############################################################################### 19 | # Constants 20 | ############################################################################### 21 | 22 | 23 | # List of tokens to filter from Buckeye annotations 24 | BUCKEYE_FILTER_LIST = [ 25 | '{B_TRANS}', 26 | '{E_TRANS}', 27 | '', 28 | '', 29 | '', 30 | '', 31 | '', 32 | '', 33 | '', 34 | '', 35 | '', 36 | '', 37 | '', 38 | '', 39 | '', 40 | '', 41 | '', 42 | '', 43 | ] 44 | 45 | # Speakers selected by sorting the train-clean-100 speakers by longest total 46 | # recording duration and manually selecting speakers with more natural, 47 | # conversational (as opposed to read) prosody 48 | LIBRITTS_SPEAKERS = [ 49 | # Top 5 Female (primarily by length) 50 | 40, 51 | 669, 52 | 4362, 53 | 5022, 54 | 8123, 55 | 56 | # Additional female speakers to get to 1/8th of train-clean-100 57 | 5022, 58 | 696, 59 | 6272, 60 | 5163, 61 | 62 | # Top 5 Male (primarily by length) 63 | 196, 64 | 460, 65 | 1355, 66 | 3664, 67 | 7067, 68 | 69 | # Additional male speakers to get to 1/8th of train-clean-100 70 | 405, 71 | 6437, 72 | 446, 73 | 4397 74 | ] 75 | 76 | 77 | ############################################################################### 78 | # Download datasets 79 | ############################################################################### 80 | 81 | 82 | @torchutil.notify('download') 83 | def datasets(datasets, gpu=None): 84 | """Download datasets""" 85 | for dataset in datasets: 86 | if dataset == 'automatic': 87 | automatic(gpu=gpu) 88 | elif 
dataset == 'buckeye': 89 | buckeye() 90 | elif dataset == 'crowdsource': 91 | crowdsource() 92 | elif dataset == 'libritts': 93 | libritts() 94 | else: 95 | raise ValueError(f'Dataset {dataset} is not defined') 96 | 97 | 98 | ############################################################################### 99 | # Individual dataset downloaders 100 | ############################################################################### 101 | 102 | 103 | def automatic(gpu=None): 104 | """Create dataset from trained model""" 105 | # Setup directories 106 | cache_directory = emphases.CACHE_DIR / 'automatic' 107 | cache_directory.mkdir(exist_ok=True, parents=True) 108 | 109 | # Create subdirectories 110 | features = ['alignment', 'audio', 'scores'] 111 | for feature in features: 112 | (cache_directory / feature).mkdir(exist_ok=True, parents=True) 113 | 114 | # Get files 115 | audio_files = list( 116 | (emphases.CACHE_DIR / 'libritts' / 'audio').rglob('*.wav')) 117 | stems = [file.stem for file in audio_files] 118 | 119 | # Copy from LibriTTS cache to annotation cache 120 | for stem in stems: 121 | 122 | # Copy audio 123 | audio_file = ( 124 | emphases.CACHE_DIR / 'automatic' / 'audio' / f'{stem}.wav') 125 | shutil.copyfile( 126 | emphases.CACHE_DIR / 'libritts' / 'audio' / f'{stem}.wav', 127 | audio_file) 128 | 129 | # Copy alignment 130 | shutil.copyfile( 131 | emphases.CACHE_DIR / 'libritts' / 'alignment' / f'{stem}.TextGrid', 132 | emphases.CACHE_DIR / 'automatic' / 'alignment' / f'{stem}.TextGrid') 133 | 134 | # Load alignment 135 | alignment = pypar.Alignment( 136 | emphases.CACHE_DIR / 'automatic' / 'alignment' / f'{stem}.TextGrid') 137 | 138 | # Load audio 139 | audio, _ = torchaudio.load(audio_file) 140 | 141 | # Infer scores 142 | scores = emphases.from_alignment_and_audio( 143 | alignment, 144 | audio, 145 | emphases.SAMPLE_RATE, 146 | gpu=gpu).detach().cpu() 147 | 148 | # Save scores 149 | torch.save(scores, cache_directory / 'scores' / f'{stem}.pt') 150 | 151 | 152 | def crowdsource(): 153 | """Prepare crowdsourced dataset""" 154 | # Get annotation config 155 | with open(emphases.DEFAULT_ANNOTATION_CONFIG, "r") as stream: 156 | annotation_config = yaml.safe_load(stream) 157 | 158 | # Setup directories 159 | data_directory = emphases.DATA_DIR / 'crowdsource' 160 | cache_directory = emphases.CACHE_DIR / 'crowdsource' 161 | cache_directory.mkdir(exist_ok=True, parents=True) 162 | 163 | # Create subdirectories 164 | features = ['alignment', 'audio', 'scores'] 165 | for feature in features: 166 | (cache_directory / feature).mkdir(exist_ok=True, parents=True) 167 | 168 | # Load annotations data 169 | annotation_data = {} 170 | for directory in data_directory.glob('*'): 171 | 172 | source_directory = directory / annotation_config['name'] 173 | table_directory = source_directory / 'tables' 174 | 175 | # Participant data 176 | participants = {} 177 | with open(table_directory / 'participants.csv') as file: 178 | for row in csv.DictReader(file): 179 | try: 180 | 181 | # Crowdsourced annotation 182 | participants[row['ID']] = { 183 | 'language': row['Language'], 184 | 'country': row['Country'], 185 | 'annotations': []} 186 | 187 | except KeyError as error: 188 | 189 | # Manual annotation 190 | participants[row['ID']] = { 191 | 'language': 'English', 192 | 'country': 'United States', 193 | 'annotations': []} 194 | 195 | # Response data 196 | with open(table_directory / 'responses.csv') as file: 197 | for row in csv.DictReader(file): 198 | participant = row['Participant'] 199 | 200 | # Add participant 
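As an illustrative aside (not part of the repository files): the response-parsing loop here joins each row of `responses.csv` with its participant and, further down in this function, expands the response string, one '0'/'1' character per non-silent word, into a list of per-word scores. A minimal sketch of that decoding, using a hypothetical row and word list rather than real project data:

```python
# Hypothetical annotation row: one '0'/'1' character per non-silent word
row = {'Stem': 'libritts-0001', 'Response': '01001'}
words = ['the', 'quick', 'brown', 'fox', 'jumps']

# Expand the character string into per-word emphasis scores
scores = [float(c) for c in row['Response']]
assert len(scores) == len(words)

print(dict(zip(words, scores)))
# {'the': 0.0, 'quick': 1.0, 'brown': 0.0, 'fox': 0.0, 'jumps': 1.0}
```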
201 | if participant not in annotation_data: 202 | annotation_data[participant] = participants[participant] 203 | 204 | # Get word start and end times 205 | alignment = pypar.Alignment( 206 | emphases.CACHE_DIR / 207 | 'libritts' / 208 | 'alignment' / 209 | f'{row["Stem"]}.TextGrid') 210 | words = [ 211 | (str(word).lower(), word.start(), word.end()) 212 | for word in alignment 213 | if str(word) != pypar.SILENCE] 214 | 215 | # Format annotation 216 | entry = { 217 | 'stem': row['Stem'], 218 | 'score': [float(c) for c in row['Response']], 219 | 'words': words} 220 | assert len(entry['words']) == len(entry['score']) 221 | 222 | # Add annotation 223 | annotation_data[participant]['annotations'].append(entry) 224 | 225 | # Get worker ID correspondence 226 | correspondence = {} 227 | for directory in data_directory.glob('*'): 228 | file = ( 229 | directory / 230 | annotation_config['name'] / 231 | 'crowdsource' / 232 | 'crowdsource.json') 233 | with open(file) as file: 234 | contents = json.load(file) 235 | for content in contents: 236 | correspondence |= {content['ParticipantID']: content['WorkerId']} 237 | 238 | # Crowdsourced annotation 239 | if correspondence: 240 | 241 | # Filter out where incomplete or > 1/3 examples have > 2/3 words selected 242 | def valid(items): 243 | if not hasattr(valid, 'count'): 244 | valid.count = 0 245 | sums = [sum(item['score']) for item in items] 246 | counts = [len(item['score']) for item in items] 247 | invalids = [s > .67 * c for s, c in zip(sums, counts)] 248 | is_valid = sum(invalids) < .33 * len(invalids) 249 | valid.count += 1 - int(is_valid) 250 | return is_valid 251 | 252 | # Join participants with same worker ID 253 | joined = {} 254 | for participant, contents in annotation_data.items(): 255 | 256 | # Filter out bad batches 257 | if ( 258 | len(contents['annotations']) < 20 or 259 | len(contents['annotations']) % 10 > 0 or 260 | not valid(contents['annotations']) 261 | ): 262 | continue 263 | 264 | worker = correspondence[participant] 265 | if worker in joined: 266 | joined[worker]['annotations'].extend(contents['annotations']) 267 | else: 268 | joined[worker] = contents 269 | 270 | # Manual annotation 271 | else: 272 | joined = annotation_data 273 | 274 | # Anonymize 275 | anonymized = {} 276 | for i, contents in enumerate(joined.values()): 277 | anonymized[f'{i:06d}'] = contents 278 | 279 | # Save annotations in release format 280 | with open(cache_directory / 'annotations.json', 'w') as file: 281 | json.dump(anonymized, file, sort_keys=True, indent=True) 282 | 283 | # Merge binary annotations to floats 284 | annotations = merge_annotations(anonymized) 285 | 286 | # Save dictionary containing annotation counts 287 | with open(cache_directory / 'counts.json', 'w') as file: 288 | json.dump(annotations['stems'], file, sort_keys=True, indent=True) 289 | 290 | # Get annotated stems 291 | stems = [ 292 | file.replace('libritts-', '') 293 | for file in annotations['stems'].keys()] 294 | 295 | # Copy from LibriTTS cache to annotation cache 296 | for i, stem in enumerate(stems): 297 | 298 | # Get normalized scores 299 | count = annotations['stems'][stem] 300 | labels = [score / count for score in annotations['scores'][stem]] 301 | 302 | # Copy audio 303 | shutil.copyfile( 304 | emphases.CACHE_DIR / 'libritts' / 'audio' / f'{stem}.wav', 305 | emphases.CACHE_DIR / 'crowdsource' / 'audio' / f'{stem}.wav') 306 | 307 | # Copy alignment 308 | shutil.copyfile( 309 | emphases.CACHE_DIR / 'libritts' / 'alignment' / f'{stem}.TextGrid', 310 | emphases.CACHE_DIR / 
'crowdsource' / 'alignment' / f'{stem}.TextGrid') 311 | 312 | # Load alignment 313 | alignment = pypar.Alignment( 314 | emphases.CACHE_DIR / 'crowdsource' / 'alignment' / f'{stem}.TextGrid') 315 | 316 | # Match alignment and scores (silences get a score of zero) 317 | j = 0 318 | scores = torch.zeros(len(alignment)) 319 | for i, word in enumerate(alignment): 320 | 321 | # Keep silences as zero 322 | if str(word) == pypar.SILENCE: 323 | continue 324 | 325 | # Update scores 326 | scores[i] = float(labels[j]) 327 | 328 | j += 1 329 | 330 | # Save scores 331 | torch.save(scores, cache_directory / 'scores' / f'{stem}.pt') 332 | 333 | 334 | def buckeye(): 335 | """Download buckeye dataset""" 336 | # Extract tar file to data directory 337 | file = emphases.SOURCE_DIR / 'buckeye' / 'buckeye.tar.gz' 338 | with tarfile.open(file, 'r:gz') as tfile: 339 | tfile.extractall(emphases.DATA_DIR) 340 | 341 | # Setup cache directory 342 | cache_directory = emphases.CACHE_DIR / 'buckeye' 343 | cache_directory.mkdir(exist_ok=True, parents=True) 344 | 345 | # Create subdirectories 346 | features = ['alignment', 'audio', 'scores'] 347 | for feature in features: 348 | (cache_directory / feature).mkdir(exist_ok=True, parents=True) 349 | 350 | # Copy alignments and filter out unused tokens 351 | data_directory = emphases.DATA_DIR / 'buckeye' 352 | alignment_files = (data_directory / 'alignment').glob('*.TextGrid') 353 | for file in alignment_files: 354 | 355 | # Load alignment 356 | alignment = pypar.Alignment(file) 357 | 358 | # Filter 359 | for word in alignment: 360 | if str(word) in BUCKEYE_FILTER_LIST: 361 | word.word = pypar.SILENCE 362 | word.phonemes = [ 363 | pypar.Phoneme(pypar.SILENCE, word.start(), word.end())] 364 | 365 | # Deduplicate silence tokens 366 | i = 0 367 | words = alignment.words() 368 | prev_silence = False 369 | while i < len(words): 370 | word = words[i] 371 | if str(word) == pypar.SILENCE: 372 | if prev_silence: 373 | words[i - 1][-1]._end = word.end() 374 | del words[i] 375 | else: 376 | prev_silence = True 377 | i += 1 378 | else: 379 | prev_silence = False 380 | i += 1 381 | 382 | # Save alignment 383 | pypar.Alignment(words).save( 384 | cache_directory / 'alignment' / f'{file.stem}.TextGrid') 385 | 386 | # Get audio files 387 | audio_files = sorted((data_directory / 'audio').glob('*.wav')) 388 | 389 | # Resample audio 390 | for audio_file in audio_files: 391 | 392 | # Load and resample 393 | audio = emphases.load.audio(audio_file) 394 | 395 | # If audio is too quiet, increase the volume 396 | maximum = torch.abs(audio).max() 397 | if maximum < .35: 398 | audio *= .35 / maximum 399 | 400 | # Save to disk 401 | torchaudio.save( 402 | cache_directory / 'audio' / audio_file.name, 403 | audio, 404 | emphases.SAMPLE_RATE) 405 | 406 | # Read buckeye annotations 407 | data_directory = emphases.DATA_DIR / 'buckeye' 408 | with open(data_directory / 'annotations.csv') as file: 409 | reader = csv.DictReader(file) 410 | annotations = [row for row in reader] 411 | 412 | # Extract per-word emphasis scores 413 | alignment_files = (cache_directory / 'alignment').glob('*.TextGrid') 414 | for file in alignment_files: 415 | 416 | # Load alignment 417 | alignment = pypar.Alignment(file) 418 | 419 | # Get words from annotation 420 | words = [word for word in annotations if word['filename'] == file.stem] 421 | words = sorted(words, key=lambda x: float(x['wordmin'])) 422 | 423 | # Get per-word emphasis scores 424 | j = 0 425 | scores = torch.zeros(len(alignment)) 426 | for i, word in enumerate(alignment): 
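The silence-deduplication pass above collapses runs of consecutive silence tokens into a single silence spanning the whole run. As an aside, here is a simplified, pypar-free sketch of the same idea on plain `(label, start, end)` tuples; `SILENCE` is just a stand-in string, not the library constant:

```python
SILENCE = '<silence>'  # stand-in for the aligner's silence token

def deduplicate_silence(words):
    """Merge runs of consecutive silence tokens into one spanning silence."""
    merged = []
    for label, start, end in words:
        if merged and label == SILENCE and merged[-1][0] == SILENCE:
            # Extend the previous silence to cover this one
            merged[-1] = (SILENCE, merged[-1][1], end)
        else:
            merged.append((label, start, end))
    return merged

words = [
    ('hello', 0.0, 0.4),
    (SILENCE, 0.4, 0.6),
    (SILENCE, 0.6, 0.9),
    ('world', 0.9, 1.3)]
print(deduplicate_silence(words))
# [('hello', 0.0, 0.4), ('<silence>', 0.4, 0.9), ('world', 0.9, 1.3)]
```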
427 | 428 | # Keep silences as zero 429 | if str(word) == pypar.SILENCE: 430 | continue 431 | 432 | # Make sure alignments are aligned 433 | assert str(word).lower() == words[j]['word'].lower() 434 | assert (word.start() - float(words[j]['wordmin'])) < 1e-4 435 | assert (word.end() - float(words[j]['wordmax'])) < 1e-4 436 | 437 | # Update scores 438 | # pa.32 is the average of 32 human judgments of the perception of 439 | # prominence based on acoustic features 440 | scores[i] = float(words[j]['pa.32']) 441 | 442 | j += 1 443 | 444 | # Save scores 445 | torch.save(scores, cache_directory / 'scores' / f'{file.stem}.pt') 446 | 447 | 448 | def libritts(): 449 | """Download libritts dataset""" 450 | # Setup source directory 451 | source_directory = emphases.SOURCE_DIR / 'libritts' 452 | source_directory.mkdir(exist_ok=True, parents=True) 453 | 454 | # Download 455 | url = 'https://us.openslr.org/resources/60/train-clean-100.tar.gz' 456 | file = source_directory / 'libritts-train-clean-100.tar.gz' 457 | torchutil.download.file(url, file) 458 | 459 | # Unzip 460 | with tarfile.open(file, 'r:gz') as tfile: 461 | tfile.extractall(emphases.DATA_DIR) 462 | 463 | # Rename folder 464 | directory = emphases.DATA_DIR / 'libritts' 465 | shutil.rmtree(directory, ignore_errors=True) 466 | shutil.move(emphases.DATA_DIR / 'LibriTTS', directory) 467 | 468 | # Download annotations from zenodo 469 | url = 'https://zenodo.org/records/10402793/files/libritts-emphasis-annotations.json?download=1' 470 | file = source_directory / 'annotations.json' 471 | torchutil.download.file(url, file) 472 | 473 | # Load annotations 474 | with open(source_directory / 'annotations.json') as file: 475 | annotations = json.load(file) 476 | 477 | # Merge annotations to floats 478 | annotations = merge_annotations(annotations) 479 | 480 | # Get list of audio files 481 | audio_files = list(directory.rglob('*.wav')) 482 | audio_files = [ 483 | file for file in audio_files if file.stem in annotations['stems']] 484 | 485 | # Setup cache directory 486 | cache_directory = emphases.CACHE_DIR / 'libritts' 487 | cache_directory.mkdir(exist_ok=True, parents=True) 488 | 489 | # Create subdirectories 490 | features = ['alignment', 'audio', 'scores'] 491 | for feature in features: 492 | (cache_directory / feature).mkdir(exist_ok=True, parents=True) 493 | 494 | # Iterate over files 495 | for audio_file in torchutil.iterator( 496 | audio_files, 497 | 'Formatting libritts', 498 | total=len(audio_files) 499 | ): 500 | 501 | # Load and resample audio 502 | audio = emphases.load.audio(audio_file) 503 | 504 | # If audio is too quiet, increase the volume 505 | maximum = torch.abs(audio).max() 506 | if maximum < .35: 507 | audio *= .35 / maximum 508 | 509 | # Save audio 510 | stem = audio_file.stem 511 | torchaudio.save( 512 | cache_directory / 'audio' / f'{stem}.wav', 513 | audio, 514 | emphases.SAMPLE_RATE) 515 | 516 | # Align text and audio 517 | text_files = [ 518 | file.with_suffix('.normalized.txt') for file in audio_files] 519 | alignment_files = [ 520 | cache_directory / 'alignment' / f'{file.stem}.TextGrid' 521 | for file in audio_files] 522 | pyfoal.from_files_to_files( 523 | text_files, 524 | audio_files, 525 | alignment_files, 526 | 'p2fa') 527 | 528 | for i, stem in enumerate([file.stem for file in audio_files]): 529 | 530 | # Load alignment 531 | alignment = pypar.Alignment( 532 | cache_directory / 'alignment' / f'{stem}.TextGrid') 533 | 534 | # Get ground truth 535 | count = annotations['stems'][stem] 536 | labels = [score / count for score 
in annotations['scores'][stem]] 537 | 538 | # Match alignment and scores (silences get a score of zero) 539 | j = 0 540 | scores = torch.zeros(len(alignment)) 541 | for i, word in enumerate(alignment): 542 | 543 | # Keep silences as zero 544 | if str(word) == pypar.SILENCE: 545 | continue 546 | 547 | # Update scores 548 | scores[i] = float(labels[j]) 549 | 550 | j += 1 551 | 552 | # Save scores 553 | torch.save(scores, cache_directory / 'scores' / f'{stem}.pt') 554 | 555 | 556 | ############################################################################### 557 | # Utilities 558 | ############################################################################### 559 | 560 | 561 | def download_file(url, file): 562 | """Download file from url""" 563 | with urllib.request.urlopen(url, context=ssl.SSLContext()) as response, \ 564 | open(file, 'wb') as output: 565 | shutil.copyfileobj(response, output) 566 | 567 | 568 | def merge_annotations(annotations): 569 | """Merge crowdsourced annotations""" 570 | merged = {'samples': 0, 'scores': {}, 'stems': {}} 571 | for _, responses in annotations.items(): 572 | 573 | # Iterate over stems 574 | for response in responses['annotations']: 575 | stem = response['stem'] 576 | score = [float(c) for c in list(response['score'])] 577 | 578 | # Merge stem annotations 579 | if stem in merged['stems']: 580 | 581 | # Maybe cap the number of allowed annotations 582 | if ( 583 | emphases.MAX_ANNOTATIONS is not None and 584 | merged['stems'][stem] == emphases.MAX_ANNOTATIONS 585 | ): 586 | continue 587 | 588 | # Update sums and counts 589 | for i in range(len(score)): 590 | merged['scores'][stem][i] += score[i] 591 | merged['stems'][stem] += 1 592 | 593 | # Add new stem 594 | else: 595 | merged['scores'][stem] = score 596 | merged['stems'][stem] = 1 597 | 598 | # Update total number of samples 599 | merged['samples'] += 1 600 | 601 | # Maybe cap the minimum required annotations 602 | if emphases.MIN_ANNOTATIONS is not None: 603 | merged['stems'] = { 604 | stem: count for stem, count in merged['stems'].items() 605 | if count == emphases.MIN_ANNOTATIONS} 606 | merged['scores'] = { 607 | stem: scores for stem, scores in merged['scores'].items() 608 | if stem in merged['stems']} 609 | 610 | return merged 611 | -------------------------------------------------------------------------------- /emphases/data/loader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Dataloader 8 | ############################################################################### 9 | 10 | 11 | def loader(dataset, partition=None, gpu=None): 12 | """Retrieve a data loader""" 13 | # Get dataset 14 | dataset = emphases.data.Dataset(dataset, partition) 15 | 16 | # Get sampler 17 | sampler = emphases.data.sampler(dataset, partition) 18 | 19 | # Create loader 20 | return torch.utils.data.DataLoader( 21 | dataset, 22 | num_workers=emphases.NUM_WORKERS, 23 | pin_memory=gpu is not None, 24 | collate_fn=emphases.data.collate, 25 | batch_sampler=sampler) 26 | -------------------------------------------------------------------------------- /emphases/data/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from . import mels 3 | from . 
import loudness 4 | -------------------------------------------------------------------------------- /emphases/data/preprocess/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Entry point 8 | ############################################################################### 9 | 10 | 11 | def parse_args(): 12 | """Parse command-line arguments""" 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | '--datasets', 16 | nargs='+', 17 | default=emphases.DATASETS, 18 | help='The datasets to preprocess') 19 | parser.add_argument( 20 | '--gpu', 21 | type=int, 22 | help='The index of the gpu to run inference on') 23 | return parser.parse_known_args()[0] 24 | 25 | 26 | if __name__ == '__main__': 27 | emphases.data.preprocess.datasets(**vars(parse_args())) 28 | -------------------------------------------------------------------------------- /emphases/data/preprocess/core.py: -------------------------------------------------------------------------------- 1 | import penn 2 | import torch 3 | import torchutil 4 | 5 | import emphases 6 | 7 | 8 | ############################################################################### 9 | # Preprocess 10 | ############################################################################### 11 | 12 | 13 | @torchutil.notify('preprocess') 14 | def datasets(datasets, gpu=None): 15 | """Preprocess datasets""" 16 | for dataset in datasets: 17 | cache_directory = emphases.CACHE_DIR / dataset 18 | 19 | # Get audio files, from cache 20 | audio_files = sorted(cache_directory.rglob('*.wav')) 21 | 22 | # Preprocess mels 23 | mel_files = [ 24 | cache_directory / 'mels' / f'{file.stem}.pt' 25 | for file in audio_files] 26 | emphases.data.preprocess.mels.from_files_to_files( 27 | audio_files, 28 | mel_files) 29 | 30 | # Preprocess loudness 31 | loudness_files = [ 32 | cache_directory / 'loudness' / f'{file.stem}.pt' 33 | for file in audio_files] 34 | emphases.data.preprocess.loudness.from_files_to_files( 35 | audio_files, 36 | loudness_files) 37 | 38 | # Preprocess pitch, periodicity 39 | (cache_directory / 'pitch').mkdir(exist_ok=True, parents=True) 40 | pitch_files = [ 41 | cache_directory / 'pitch' / f'{file.stem}' 42 | for file in audio_files] 43 | penn.from_files_to_files( 44 | audio_files, 45 | pitch_files, 46 | hopsize=emphases.convert.samples_to_seconds(emphases.HOPSIZE), 47 | fmin=emphases.FMIN, 48 | fmax=emphases.FMAX, 49 | batch_size=2048, 50 | center='half-hop', 51 | interp_unvoiced_at=emphases.VOICED_THRESHOLD, 52 | num_workers=emphases.NUM_WORKERS, 53 | gpu=gpu) 54 | 55 | # Pitch and periodicity use floating-point hopsize, while mels and 56 | # loudness use an integer hopsize in samples. This results in 57 | # single-frame differences when the audio length is within one sample 58 | # of a new frame due to floating-point error. We simply remove the last 59 | # frame in this rare case. 
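Because pitch and periodicity are extracted with a hopsize specified in seconds while mels and loudness hop by an integer number of samples, the resulting frame counts can disagree by one, as the comment above notes. A generic sketch of the trim-to-shortest repair (a simplified stand-in, not the repository's exact code):

```python
import torch

def match_frames(*features):
    """Trim [channels, frames] tensors to the shortest frame count."""
    frames = min(feature.shape[-1] for feature in features)
    return [feature[..., :frames] for feature in features]

pitch = torch.rand(1, 201)
periodicity = torch.rand(1, 201)
loudness = torch.rand(1, 200)
pitch, periodicity, loudness = match_frames(pitch, periodicity, loudness)
print(pitch.shape, loudness.shape)  # torch.Size([1, 200]) torch.Size([1, 200])
```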
60 | for loudness_file, pitch_file in zip(loudness_files, pitch_files): 61 | loudness = torch.load(loudness_file) 62 | pitch = torch.load(f'{pitch_file}-pitch.pt') 63 | periodicity = torch.load(f'{pitch_file}-periodicity.pt') 64 | if pitch.shape[1] == loudness.shape[1] + 1: 65 | pitch = pitch[:, :-1] 66 | periodicity = periodicity[:, :-1] 67 | torch.save(pitch, f'{pitch_file}-pitch.pt') 68 | torch.save(periodicity, f'{pitch_file}-periodicity.pt') 69 | 70 | 71 | def from_audio(audio, gpu=None): 72 | """Preprocess one audio file""" 73 | # Move to device (no-op if devices are the same) 74 | audio = audio.to('cpu' if gpu is None else f'cuda:{gpu}') 75 | 76 | features = [] 77 | 78 | # Preprocess mels 79 | if emphases.MEL_FEATURE: 80 | features.append(emphases.data.preprocess.mels.from_audio(audio)) 81 | 82 | # Preprocess pitch and periodicity 83 | if emphases.PITCH_FEATURE or emphases.PERIODICITY_FEATURE: 84 | pitch, periodicity = penn.from_audio( 85 | audio, 86 | emphases.SAMPLE_RATE, 87 | hopsize=emphases.convert.samples_to_seconds(emphases.HOPSIZE), 88 | fmin=emphases.FMIN, 89 | fmax=emphases.FMAX, 90 | pad=True, 91 | interp_unvoiced_at=emphases.VOICED_THRESHOLD, 92 | gpu=gpu) 93 | 94 | if emphases.PITCH_FEATURE: 95 | if emphases.NORMALIZE: 96 | features.append( 97 | (torch.log2(pitch) - emphases.LOGFMIN) / 98 | (emphases.LOGFMAX - emphases.LOGFMIN)) 99 | else: 100 | features.append(torch.log2(pitch)) 101 | 102 | if emphases.PERIODICITY_FEATURE: 103 | features.append(periodicity) 104 | 105 | # Pitch and periodicity use floating-point hopsize, while mels and 106 | # loudness use an integer hopsize in samples. This results in 107 | # single-frame differences when the audio length is within one sample 108 | # of a new frame due to floating-point error. We simply remove the last 109 | # frame in this rare case. 
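The pitch branch above maps Hz onto a log-frequency scale and then to roughly [0, 1] using the configured log-frequency range. A standalone sketch with hypothetical FMIN/FMAX values (the real bounds live in the project configuration):

```python
import math
import torch

# Hypothetical pitch range in Hz; the real values come from the config
FMIN, FMAX = 50., 550.
LOGFMIN, LOGFMAX = math.log2(FMIN), math.log2(FMAX)

def normalize_pitch(pitch):
    """Map pitch in Hz to roughly [0, 1] on a log-frequency scale."""
    return (torch.log2(pitch) - LOGFMIN) / (LOGFMAX - LOGFMIN)

pitch = torch.tensor([[100., 200., 400.]])
print(normalize_pitch(pitch))  # ≈ tensor([[0.2891, 0.5781, 0.8672]])
```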
110 | frames = emphases.convert.samples_to_frames(audio.shape[-1]) 111 | if pitch.shape[1] == frames + 1: 112 | pitch = pitch[:, :-1] 113 | periodicity = periodicity[:, :-1] 114 | 115 | # Preprocess loudness 116 | if emphases.LOUDNESS_FEATURE: 117 | loudness = emphases.data.preprocess.loudness.from_audio( 118 | audio, 119 | emphases.SAMPLE_RATE) 120 | features.append(loudness.to(audio.device)) 121 | 122 | # Concatenate features 123 | features = features[0] if len(features) == 1 else torch.cat(features) 124 | 125 | return features[None] 126 | -------------------------------------------------------------------------------- /emphases/data/preprocess/loudness.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import warnings 3 | 4 | import librosa 5 | import numpy as np 6 | import penn 7 | import torch 8 | import torchutil 9 | 10 | import emphases 11 | 12 | 13 | ############################################################################### 14 | # Interface 15 | ############################################################################### 16 | 17 | 18 | def from_audio(audio, sample_rate=emphases.SAMPLE_RATE): 19 | """Compute mels from audio""" 20 | # Mayble resample 21 | audio = emphases.resample(audio, sample_rate) 22 | 23 | # Compute loudness 24 | return a_weighted(audio, sample_rate, hop_length=emphases.HOPSIZE) 25 | 26 | 27 | def from_file(audio_file): 28 | """Load audio and compute mels""" 29 | audio = emphases.load.audio(audio_file) 30 | 31 | # Compute loudness 32 | return from_audio(audio) 33 | 34 | 35 | def from_file_to_file(audio_file, output_file): 36 | """Compute loudness from audio file and save to disk""" 37 | loudness = from_file(audio_file) 38 | 39 | # Save to disk 40 | output_file.parent.mkdir(exist_ok=True, parents=True) 41 | torch.save(loudness, output_file) 42 | 43 | 44 | def from_files_to_files(audio_files, output_files): 45 | """Compute loudness for many files and save to disk""" 46 | torchutil.multiprocess_iterator( 47 | wrapper, 48 | zip(audio_files, output_files), 49 | 'Preprocessing a-weighted loudness', 50 | total=len(audio_files), 51 | num_workers=emphases.NUM_WORKERS) 52 | 53 | 54 | ############################################################################### 55 | # Loudness 56 | ############################################################################### 57 | 58 | 59 | def a_weighted(audio, sample_rate, hop_length=None, pad=False): 60 | """Retrieve the per-frame loudness""" 61 | # Save device 62 | device = audio.device 63 | 64 | # Default hop length of 10 ms 65 | hop_length = sample_rate // 100 if hop_length is None else hop_length 66 | 67 | if audio.dim() == 2: 68 | audio = audio[:, None, :] 69 | elif audio.dim() == 1: 70 | audio = audio[None, None, :] 71 | 72 | # Pad audio 73 | p = (emphases.NUM_FFT - emphases.HOPSIZE) // 2 74 | audio = torch.nn.functional.pad(audio, (p, p), "reflect").squeeze(1) 75 | 76 | # Convert to numpy 77 | audio = audio.detach().cpu().numpy().squeeze(0) 78 | 79 | # Cache weights 80 | if not hasattr(a_weighted, 'weights'): 81 | a_weighted.weights = perceptual_weights() 82 | 83 | # Take stft 84 | stft = librosa.stft( 85 | audio, 86 | n_fft=penn.WINDOW_SIZE, 87 | hop_length=hop_length, 88 | win_length=penn.WINDOW_SIZE, 89 | center=pad, 90 | pad_mode='constant') 91 | 92 | # Compute magnitude on db scale 93 | db = librosa.amplitude_to_db(np.abs(stft)) 94 | 95 | # Apply A-weighting 96 | weighted = db + a_weighted.weights 97 | 98 | # Threshold 99 | weighted[weighted < 
emphases.MIN_DB] = emphases.MIN_DB 100 | 101 | # Average over weighted frequencies 102 | loudness = torch.from_numpy(weighted.mean(axis=0)).float().to(device)[None] 103 | 104 | # Scale to roughly [0, 1] 105 | if emphases.NORMALIZE: 106 | return (loudness + 100.) / 100. 107 | return loudness 108 | 109 | 110 | def perceptual_weights(): 111 | """A-weighted frequency-dependent perceptual loudness weights""" 112 | frequencies = librosa.fft_frequencies( 113 | sr=penn.SAMPLE_RATE, 114 | n_fft=penn.WINDOW_SIZE) 115 | 116 | # A warning is raised for nearly inaudible frequencies, but it ends up 117 | # defaulting to -100 db. That default is fine for our purposes. 118 | with warnings.catch_warnings(): 119 | warnings.simplefilter('ignore', RuntimeWarning) 120 | return librosa.A_weighting(frequencies)[:, None] - emphases.REF_DB 121 | 122 | def wrapper(item): 123 | """Multiprocessing wrapper""" 124 | from_file_to_file(*item) 125 | -------------------------------------------------------------------------------- /emphases/data/preprocess/mels.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import os 3 | 4 | import librosa 5 | import torch 6 | import torchutil 7 | 8 | import emphases 9 | 10 | 11 | ############################################################################### 12 | # Mel spectrogram 13 | ############################################################################### 14 | 15 | 16 | def from_audio(audio): 17 | """Compute spectrogram from audio""" 18 | # Cache hann window 19 | if ( 20 | not hasattr(from_audio, 'window') or 21 | from_audio.dtype != audio.dtype or 22 | from_audio.device != audio.device 23 | ): 24 | from_audio.window = torch.hann_window( 25 | emphases.WINDOW_SIZE, 26 | dtype=audio.dtype, 27 | device=audio.device) 28 | from_audio.dtype = audio.dtype 29 | from_audio.device = audio.device 30 | 31 | # Pad audio 32 | size = (emphases.NUM_FFT - emphases.HOPSIZE) // 2 33 | audio = torch.nn.functional.pad( 34 | audio, 35 | (size, size), 36 | mode='reflect') 37 | 38 | # Compute stft 39 | stft = torch.stft( 40 | audio.squeeze(1), 41 | emphases.NUM_FFT, 42 | hop_length=emphases.HOPSIZE, 43 | window=from_audio.window, 44 | center=False, 45 | normalized=False, 46 | onesided=True, 47 | return_complex=True) 48 | stft = torch.view_as_real(stft)[0] 49 | 50 | # Compute magnitude 51 | spectrogram = torch.sqrt(stft.pow(2).sum(-1) + 1e-6) 52 | 53 | # Convert to mels 54 | mels = linear_to_mel(spectrogram) 55 | 56 | # Scale to roughly [0, 1] 57 | if emphases.NORMALIZE: 58 | return (mels + 10.) / 10. 
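Both feature extractors above end with a cheap affine rescale so that, when normalization is enabled, inputs land roughly in [0, 1]: loudness maps its dB floor (assumed here to be -100 dB) to 0, and log-mels map -10 to 0. A small numeric illustration of those two rescalings:

```python
import torch

# A-weighted loudness in dB, floored at an assumed MIN_DB of -100
loudness_db = torch.tensor([-100., -50., 0.])
print((loudness_db + 100.) / 100.)  # 0.0, 0.5, 1.0

# Log-mel energies, rescaled the same way with a different offset
log_mels = torch.tensor([-10., -5., 0.])
print((log_mels + 10.) / 10.)  # 0.0, 0.5, 1.0
```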
59 | return mels 60 | 61 | 62 | def from_file(audio_file): 63 | """Load audio and compute mels""" 64 | audio = emphases.load.audio(audio_file) 65 | 66 | # Compute mels 67 | return from_audio(audio) 68 | 69 | 70 | def from_file_to_file(audio_file, output_file): 71 | """Compute mels from audio file and save to disk""" 72 | mels = from_file(audio_file) 73 | 74 | # Save to disk 75 | output_file.parent.mkdir(exist_ok=True, parents=True) 76 | torch.save(mels, output_file) 77 | 78 | 79 | def from_files_to_files(audio_files, output_files): 80 | """Compute mels for many files and save to disk""" 81 | torchutil.multiprocess_iterator( 82 | wrapper, 83 | zip(audio_files, output_files), 84 | 'Preprocessing mels', 85 | total=len(audio_files), 86 | num_workers=emphases.NUM_WORKERS) 87 | 88 | 89 | ############################################################################### 90 | # Utilities 91 | ############################################################################### 92 | 93 | 94 | def linear_to_mel(spectrogram): 95 | # Create mel basis 96 | if not hasattr(linear_to_mel, 'mel_basis'): 97 | basis = librosa.filters.mel( 98 | sr=emphases.SAMPLE_RATE, 99 | n_fft=emphases.NUM_FFT, 100 | n_mels=emphases.NUM_MELS) 101 | basis = torch.from_numpy(basis) 102 | basis = basis.to(spectrogram.dtype).to(spectrogram.device) 103 | linear_to_mel.basis = basis 104 | 105 | # Convert to mels 106 | melspectrogram = torch.matmul(linear_to_mel.basis, spectrogram) 107 | 108 | # Apply dynamic range compression 109 | return torch.log(torch.clamp(melspectrogram, min=1e-5)) 110 | 111 | def wrapper(item): 112 | """Multiprocessing wrapper""" 113 | from_file_to_file(*item) 114 | -------------------------------------------------------------------------------- /emphases/data/sampler.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Sampler selection 8 | ############################################################################### 9 | 10 | 11 | def sampler(dataset, partition): 12 | """Create batch sampler""" 13 | # Deterministic random sampler for training 14 | if partition in ['train', 'valid']: 15 | return Sampler(dataset) 16 | 17 | # Sample validation and test data sequentially 18 | elif partition.startswith('test'): 19 | return torch.utils.data.BatchSampler( 20 | torch.utils.data.SequentialSampler(dataset), 21 | 1, 22 | False) 23 | 24 | else: 25 | raise ValueError(f'Partition {partition} is not defined') 26 | 27 | 28 | ############################################################################### 29 | # Samplers 30 | ############################################################################### 31 | 32 | 33 | class Sampler: 34 | 35 | def __init__(self, dataset, max_frames=emphases.MAX_TRAINING_FRAMES): 36 | self.max_frames = max_frames 37 | self.epoch = 0 38 | self.length = len(dataset) 39 | self.buckets = dataset.buckets() 40 | 41 | def __iter__(self): 42 | return iter(self.batch()) 43 | 44 | def __len__(self): 45 | return len(self.batch()) 46 | 47 | def batch(self): 48 | """Produces batch indices for one epoch""" 49 | # Deterministic shuffling based on epoch 50 | generator = torch.Generator() 51 | generator.manual_seed(emphases.RANDOM_SEED + self.epoch) 52 | 53 | # Iterate over length-partitioned buckets 54 | batches = [] 55 | for bucket in self.buckets: 56 | 57 | # Shuffle bucket 58 | bucket = bucket[ 59 | torch.randperm(len(bucket), 
generator=generator).tolist()] 60 | 61 | # Variable batch size 62 | batch = [] 63 | max_length = 0 64 | for index, length in bucket: 65 | max_length = max(max_length, length) 66 | if ( 67 | batch and 68 | (len(batch) + 1) * max_length > self.max_frames 69 | ): 70 | batches.append(batch) 71 | max_length = length 72 | batch = [index] 73 | else: 74 | batch.append(index) 75 | 76 | # Don't drop last batch 77 | if batch: 78 | batches.append(batch) 79 | 80 | # Shuffle 81 | return [ 82 | batches[i] for i in 83 | torch.randperm(len(batches), generator=generator).tolist()] 84 | 85 | def set_epoch(self, epoch): 86 | self.epoch = epoch 87 | -------------------------------------------------------------------------------- /emphases/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from . import metrics 3 | from .metrics import Metrics 4 | -------------------------------------------------------------------------------- /emphases/evaluate/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Entry point 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument( 16 | '--datasets', 17 | nargs='+', 18 | default=emphases.EVALUATION_DATASETS, 19 | help='The datasets to evaluate') 20 | parser.add_argument( 21 | '--checkpoint', 22 | type=Path, 23 | help='The checkpoint file to evaluate') 24 | parser.add_argument( 25 | '--gpu', 26 | type=int, 27 | help='The index of the GPU to use for evaluation') 28 | 29 | return parser.parse_known_args()[0] 30 | 31 | 32 | if __name__ == '__main__': 33 | emphases.evaluate.datasets(**vars(parse_args())) 34 | -------------------------------------------------------------------------------- /emphases/evaluate/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import torch 4 | import torchutil 5 | 6 | import emphases 7 | 8 | 9 | ############################################################################### 10 | # Evaluate 11 | ############################################################################### 12 | 13 | 14 | @torchutil.notify('evaluate') 15 | def datasets(datasets, checkpoint=None, gpu=None): 16 | """Perform evaluation""" 17 | device = torch.device('cpu' if gpu is None else f'cuda:{gpu}') 18 | 19 | # Containers for results 20 | overall, granular = {}, {} 21 | 22 | # Evaluate each dataset 23 | for dataset in datasets: 24 | 25 | # Get data loader 26 | loader = emphases.data.loader(dataset, 'test', gpu) 27 | 28 | # Get mean and variance for Pearson Correlation 29 | target_stats = emphases.evaluate.metrics.Statistics() 30 | predicted_stats = emphases.evaluate.metrics.Statistics() 31 | for batch in loader: 32 | 33 | # Unpack 34 | _, _, _, word_lengths, targets, alignments, audio, _ = batch 35 | 36 | # Get predicted scores 37 | scores = emphases.from_alignment_and_audio( 38 | alignments[0], 39 | audio[0], 40 | emphases.SAMPLE_RATE, 41 | checkpoint=checkpoint, 42 | gpu=gpu) 43 | 44 | # Update statistics 45 | target_stats.update(targets, word_lengths) 46 | predicted_stats.update(scores[None], word_lengths) 47 | 48 | # Get metric class 49 | metric_fn = 
emphases.evaluate.Metrics 50 | 51 | # Per-file metrics 52 | file_metrics = metric_fn(predicted_stats, target_stats) 53 | 54 | # Per-dataset metrics 55 | dataset_metrics = metric_fn(predicted_stats, target_stats) 56 | 57 | # Iterate over test set 58 | for batch in torchutil.iterator( 59 | loader, 60 | f'Evaluating {emphases.CONFIG} on {dataset}', 61 | total=len(loader) 62 | ): 63 | 64 | # Unpack 65 | ( 66 | _, 67 | frame_lengths, 68 | word_bounds, 69 | word_lengths, 70 | targets, 71 | alignments, 72 | audio, 73 | stems 74 | ) = batch 75 | 76 | # Reset file metrics 77 | file_metrics.reset() 78 | 79 | if emphases.METHOD == 'neural': 80 | 81 | # Get predicted scores 82 | scores = [] 83 | 84 | # Preprocess audio 85 | for features, word_bounds in emphases.preprocess( 86 | alignments[0], 87 | audio[0], 88 | gpu=gpu 89 | ): 90 | 91 | # Infer 92 | logits = emphases.infer( 93 | features, 94 | word_bounds, 95 | checkpoint).detach() 96 | 97 | # Skip postprocessing 98 | scores.append(logits) 99 | 100 | # Concatenate results 101 | scores = torch.cat(scores, 2) 102 | 103 | else: 104 | 105 | # Baseline method inference 106 | scores = emphases.from_alignment_and_audio( 107 | alignments[0], 108 | audio[0], 109 | emphases.SAMPLE_RATE, 110 | gpu=gpu)[None] 111 | 112 | # Update metrics 113 | args = (scores, targets.to(device), word_lengths.to(device)) 114 | file_metrics.update(*args) 115 | dataset_metrics.update(*args) 116 | 117 | # Copy results 118 | granular[f'{dataset}/{stems[0]}'] = file_metrics() 119 | overall[dataset] = dataset_metrics() 120 | 121 | # Write to json files 122 | directory = emphases.EVAL_DIR / emphases.CONFIG 123 | directory.mkdir(exist_ok=True, parents=True) 124 | with open(directory / 'overall.json', 'w') as file: 125 | json.dump(overall, file, indent=4) 126 | with open(directory / 'granular.json', 'w') as file: 127 | json.dump(granular, file, indent=4) 128 | -------------------------------------------------------------------------------- /emphases/evaluate/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchutil 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Aggregate metric 9 | ############################################################################### 10 | 11 | 12 | class Metrics: 13 | 14 | def __init__(self, predicted_stats, target_stats): 15 | self.correlation = torchutil.metrics.PearsonCorrelation( 16 | *predicted_stats(), 17 | *target_stats()) 18 | self.bce = BinaryCrossEntropy() 19 | self.mse = MeanSquaredError() 20 | 21 | def __call__(self): 22 | return { 23 | 'pearson_correlation': self.correlation(), 24 | 'bce': self.bce(), 25 | 'mse': self.mse()} 26 | 27 | def update( 28 | self, 29 | logits, 30 | targets, 31 | word_lengths): 32 | # Detach from graph 33 | logits = logits.detach() 34 | 35 | # Word resolution sequence mask 36 | mask = emphases.model.mask_from_lengths(word_lengths) 37 | logits, targets = logits[mask], targets[mask] 38 | 39 | # Update cross entropy 40 | self.bce.update(logits, targets) 41 | 42 | # Update squared error 43 | self.mse.update(emphases.postprocess(logits), targets) 44 | 45 | # Update pearson correlation 46 | self.correlation.update(emphases.postprocess(logits), targets) 47 | 48 | def reset(self): 49 | self.correlation.reset() 50 | self.bce.reset() 51 | self.mse.reset() 52 | 53 | 54 | ############################################################################### 55 | # Individual metrics 56 | 
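The aggregate metric above centers on the Pearson correlation between predicted and annotated emphasis scores, computed in a streaming fashion from the precomputed mean and standard deviation statistics via torchutil. For intuition, a self-contained, non-streaming version of the same quantity:

```python
import torch

def pearson(x, y):
    """Pearson correlation between two 1-D tensors."""
    x = x - x.mean()
    y = y - y.mean()
    return (x * y).sum() / ((x * x).sum().sqrt() * (y * y).sum().sqrt() + 1e-8)

predicted = torch.tensor([0.1, 0.7, 0.3, 0.9])
target = torch.tensor([0.0, 1.0, 0.5, 1.0])
print(pearson(predicted, target))  # ≈ 0.95
```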
############################################################################### 57 | 58 | 59 | class BinaryCrossEntropy(torchutil.metrics.Average): 60 | 61 | def update(self, scores, targets): 62 | if emphases.LOSS == 'bce': 63 | 64 | # Get values from logits 65 | values = torch.nn.functional.binary_cross_entropy_with_logits( 66 | scores, 67 | targets, 68 | reduction='none') 69 | 70 | else: 71 | 72 | # Get values from probabilities 73 | x, y = torch.clamp(scores, 0., 1.), targets 74 | values = -( 75 | y * torch.log(x + 1e-6) + (1 - y) * torch.log(1 - x + 1e-6)) 76 | 77 | # Update 78 | super().update(values, values.numel()) 79 | 80 | 81 | # TODO - fix scaling 82 | class MeanSquaredError(torchutil.metrics.Average): 83 | 84 | def update( 85 | self, 86 | scores, 87 | targets): 88 | # Compute sum of MSE 89 | values = torch.nn.functional.mse_loss( 90 | scores, 91 | targets, 92 | reduction='none') 93 | 94 | # Update 95 | super().update(values, values.numel()) 96 | 97 | 98 | ############################################################################### 99 | # Utilities 100 | ############################################################################### 101 | 102 | 103 | class Statistics(torchutil.metrics.MeanStd): 104 | 105 | def update(self, values, lengths): 106 | # Sequence mask 107 | mask = emphases.model.mask_from_lengths(lengths) 108 | 109 | # Update 110 | super().update(values[mask].flatten().tolist()) 111 | -------------------------------------------------------------------------------- /emphases/load.py: -------------------------------------------------------------------------------- 1 | import torchaudio 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Loading utilities 8 | ############################################################################### 9 | 10 | 11 | def audio(file): 12 | """Load audio and maybe resample""" 13 | # Load 14 | audio, sample_rate = torchaudio.load(file) 15 | 16 | # Maybe resample 17 | return emphases.resample(audio, sample_rate) 18 | -------------------------------------------------------------------------------- /emphases/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from .layers import Layers 3 | 4 | import emphases 5 | -------------------------------------------------------------------------------- /emphases/model/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Model definition 8 | ############################################################################### 9 | 10 | 11 | class Model(torch.nn.Module): 12 | 13 | def __init__(self): 14 | super().__init__() 15 | 16 | # Input projection 17 | self.input_layer = torch.nn.Conv1d( 18 | emphases.NUM_FEATURES, 19 | emphases.CHANNELS, 20 | kernel_size=emphases.ENCODER_KERNEL_SIZE, 21 | padding='same') 22 | 23 | # Frame encoder 24 | self.frame_encoder = emphases.model.Layers( 25 | kernel_size=emphases.ENCODER_KERNEL_SIZE) 26 | 27 | # If we are resampling within the model, initialize word decoder 28 | if emphases.DOWNSAMPLE_LOCATION in ['input', 'intermediate']: 29 | self.word_decoder = emphases.model.Layers( 30 | kernel_size=emphases.DECODER_KERNEL_SIZE) 31 | 32 | # Output projection 33 | self.output_layer = torch.nn.Conv1d( 34 | emphases.CHANNELS, 35 | 1, 36 | 
kernel_size=emphases.DECODER_KERNEL_SIZE, 37 | padding='same') 38 | 39 | def forward(self, features, frame_lengths, word_bounds, word_lengths): 40 | 41 | if emphases.DOWNSAMPLE_LOCATION == 'input': 42 | 43 | # Segment acoustic features into word segments 44 | segments, bounds, lengths = emphases.segment( 45 | features, 46 | word_bounds, 47 | word_lengths) 48 | 49 | # Embed frames 50 | frame_embeddings = self.frame_encoder( 51 | self.input_layer(segments), 52 | lengths) 53 | 54 | # Downsample 55 | if emphases.DOWNSAMPLE_METHOD == 'average': 56 | word_embeddings = frame_embeddings.mean(dim=2, keepdim=True) 57 | elif emphases.DOWNSAMPLE_METHOD == 'max': 58 | word_embeddings = frame_embeddings.max( 59 | dim=2, 60 | keepdim=True 61 | ).values 62 | elif emphases.DOWNSAMPLE_METHOD == 'sum': 63 | word_embeddings = frame_embeddings.sum(dim=2, keepdim=True) 64 | elif emphases.DOWNSAMPLE_METHOD == 'center': 65 | word_embeddings = emphases.downsample( 66 | frame_embeddings, 67 | bounds, 68 | torch.ones( 69 | (len(lengths),), 70 | dtype=torch.long, 71 | device=lengths.device)) 72 | else: 73 | raise ValueError( 74 | f'Interpolation method {emphases.DOWNSAMPLE_METHOD} is not defined') 75 | 76 | # Stitch together word segment embeddings 77 | mask = mask_from_lengths(word_lengths) 78 | word_embeddings = word_embeddings.squeeze(2).transpose(0, 1).reshape( 79 | word_embeddings.shape[1], 80 | word_bounds.shape[0], 81 | word_bounds.shape[2] 82 | ).permute(1, 0, 2) * mask 83 | 84 | # Decode 85 | word_embeddings = self.word_decoder( 86 | word_embeddings, 87 | word_lengths) 88 | 89 | else: 90 | 91 | # Embed frames 92 | frame_embeddings = self.frame_encoder( 93 | self.input_layer(features), 94 | frame_lengths) 95 | 96 | if emphases.DOWNSAMPLE_LOCATION == 'intermediate': 97 | 98 | # Downsample activations to word resolution 99 | word_embeddings = emphases.downsample( 100 | frame_embeddings, 101 | word_bounds, 102 | word_lengths) 103 | 104 | # Infer emphasis scores from word embeddings 105 | word_embeddings = self.word_decoder( 106 | word_embeddings, 107 | word_lengths) 108 | 109 | elif emphases.DOWNSAMPLE_LOCATION == 'loss': 110 | 111 | # Downsample activations to word resolution 112 | word_embeddings = emphases.downsample( 113 | frame_embeddings, 114 | word_bounds, 115 | word_lengths) 116 | 117 | elif emphases.DOWNSAMPLE_LOCATION == 'inference': 118 | 119 | if self.training: 120 | 121 | # Return frame resolution prominence for framewise loss 122 | return self.output_layer(frame_embeddings) 123 | 124 | else: 125 | 126 | # Downsample activations to word resolution 127 | word_embeddings = emphases.downsample( 128 | frame_embeddings, 129 | word_bounds, 130 | word_lengths) 131 | 132 | else: 133 | raise ValueError( 134 | f'Downsample location {emphases.DOWNSAMPLE_LOCATION} ' + 135 | 'not recognized') 136 | 137 | # Project to scalar 138 | return self.output_layer(word_embeddings) 139 | 140 | 141 | ############################################################################### 142 | # Utilities 143 | ############################################################################### 144 | 145 | 146 | def mask_from_lengths(lengths): 147 | """Create boolean mask from sequence lengths""" 148 | x = torch.arange(lengths.max(), dtype=lengths.dtype, device=lengths.device) 149 | return (x.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(1) 150 | -------------------------------------------------------------------------------- /emphases/model/layers/__init__.py: 
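`mask_from_lengths` above builds the boolean padding mask shared by the convolutional and transformer stacks. A quick usage sketch; the helper is copied here so the snippet runs on its own:

```python
import torch

def mask_from_lengths(lengths):
    """Boolean mask of shape [batch, 1, max_length], mirroring the helper above."""
    x = torch.arange(lengths.max(), dtype=lengths.dtype, device=lengths.device)
    return (x.unsqueeze(0) < lengths.unsqueeze(1)).unsqueeze(1)

lengths = torch.tensor([3, 5])
mask = mask_from_lengths(lengths)
print(mask.shape)         # torch.Size([2, 1, 5])
print(mask[0].squeeze())  # True, True, True, False, False
```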
-------------------------------------------------------------------------------- 1 | from .convolution import Convolution 2 | from .transformer import Transformer 3 | 4 | import emphases 5 | 6 | 7 | def Layers(**kwargs): 8 | if emphases.ARCHITECTURE == 'convolution': 9 | return Convolution(**kwargs) 10 | elif emphases.ARCHITECTURE == 'transformer': 11 | return Transformer() 12 | else: 13 | raise ValueError( 14 | f'Network layer {emphases.ARCHITECTURE} is not defined') 15 | -------------------------------------------------------------------------------- /emphases/model/layers/convolution.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import torch 4 | 5 | import emphases 6 | 7 | 8 | ############################################################################### 9 | # Convolution model 10 | ############################################################################### 11 | 12 | 13 | class Convolution(torch.nn.Sequential): 14 | 15 | def __init__(self, kernel_size=emphases.ENCODER_KERNEL_SIZE): 16 | # Bind common parameters 17 | conv_fn = functools.partial( 18 | torch.nn.Conv1d, 19 | kernel_size=kernel_size, 20 | padding='same') 21 | 22 | # Layers 23 | layers = [] 24 | channels = emphases.CHANNELS 25 | for _ in range(emphases.LAYERS): 26 | layers.extend(( 27 | conv_fn(channels, channels), 28 | emphases.ACTIVATION_FUNCTION())) 29 | if emphases.DROPOUT is not None: 30 | layers.append(torch.nn.Dropout(emphases.DROPOUT)) 31 | 32 | # Register to Module 33 | super().__init__(*layers) 34 | 35 | # Ignore sequence length parameter needed for Transformer model 36 | def forward(self, x, _): 37 | return super().forward(x) 38 | -------------------------------------------------------------------------------- /emphases/model/layers/transformer.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | 5 | import emphases 6 | 7 | 8 | ############################################################################### 9 | # Transformer stack 10 | ############################################################################### 11 | 12 | 13 | class Transformer(torch.nn.Module): 14 | 15 | def __init__(self, num_layers=emphases.LAYERS, channels=emphases.CHANNELS): 16 | super().__init__() 17 | self.position = PositionalEncoding(channels, .1) 18 | self.model = torch.nn.TransformerEncoder( 19 | torch.nn.TransformerEncoderLayer( 20 | channels, 21 | 2, 22 | dim_feedforward=emphases.CHANNELS), 23 | num_layers) 24 | 25 | def forward(self, x, lengths): 26 | mask = emphases.model.mask_from_lengths(lengths) 27 | return self.model( 28 | self.position(x.permute(2, 0, 1)), 29 | src_key_padding_mask=~mask.squeeze(1) 30 | ).permute(1, 2, 0) 31 | 32 | 33 | ############################################################################### 34 | # Utilities 35 | ############################################################################### 36 | 37 | 38 | class PositionalEncoding(torch.nn.Module): 39 | 40 | def __init__(self, channels, dropout=.1, max_len=5000): 41 | super().__init__() 42 | self.dropout = torch.nn.Dropout(p=dropout) 43 | index = torch.arange(max_len).unsqueeze(1) 44 | frequency = torch.exp( 45 | torch.arange(0, channels, 2) * (-math.log(10000.0) / channels)) 46 | encoding = torch.zeros(max_len, 1, channels) 47 | encoding[:, 0, 0::2] = torch.sin(index * frequency) 48 | encoding[:, 0, 1::2] = torch.cos(index * frequency) 49 | self.register_buffer('encoding', encoding) 50 | 51 | def forward(self, 
x): 52 | return self.dropout(x + self.encoding[:x.size(0)]) 53 | -------------------------------------------------------------------------------- /emphases/partition/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/partition/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import emphases 4 | 5 | 6 | def parse_args(): 7 | """Parse command-line arguments""" 8 | parser = argparse.ArgumentParser(description='Partition datasets') 9 | parser.add_argument( 10 | '--datasets', 11 | nargs='+', 12 | default=emphases.DATASETS, 13 | help='The datasets to partition') 14 | return parser.parse_known_args()[0] 15 | 16 | 17 | if __name__ == '__main__': 18 | emphases.partition.datasets(**vars(parse_args())) 19 | -------------------------------------------------------------------------------- /emphases/partition/core.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import emphases 4 | 5 | 6 | ############################################################################### 7 | # Partition dataset 8 | ############################################################################### 9 | 10 | 11 | def datasets(datasets): 12 | """Partition datasets""" 13 | for dataset in datasets: 14 | 15 | # Check if partition already exists 16 | file = emphases.PARTITION_DIR / f'{dataset}.json' 17 | 18 | # Random seed 19 | random.seed(emphases.RANDOM_SEED) 20 | 21 | # Make partition 22 | if dataset == 'automatic': 23 | partition = automatic() 24 | elif dataset == 'buckeye': 25 | partition = buckeye() 26 | elif dataset == 'libritts': 27 | partition = libritts() 28 | elif dataset == 'crowdsource': 29 | partition = crowdsource() 30 | else: 31 | raise ValueError(f'Dataset {dataset} is not defined') 32 | 33 | # Save to disk 34 | file.parent.mkdir(exist_ok=True, parents=True) 35 | with open(file, 'w') as file: 36 | json.dump(partition, file, ensure_ascii=False, indent=4) 37 | 38 | 39 | ############################################################################### 40 | # Existing datasets 41 | ############################################################################### 42 | 43 | 44 | def buckeye(): 45 | """Partition buckeye dataset""" 46 | # Get audio files 47 | directory = emphases.CACHE_DIR / 'buckeye' 48 | audio_files = directory.rglob('*.wav') 49 | 50 | # Get stems 51 | stems = [file.stem for file in audio_files] 52 | 53 | # Partition 54 | return {'train': [], 'valid': [], 'test': stems} 55 | 56 | 57 | def libritts(): 58 | """Partition libritts dataset""" 59 | # Get audio files 60 | directory = emphases.CACHE_DIR / 'libritts' 61 | audio_files = directory.rglob('*.wav') 62 | 63 | # Get stems 64 | stems = [file.stem for file in audio_files] 65 | 66 | # Shuffle stems 67 | random.seed(emphases.RANDOM_SEED) 68 | random.shuffle(stems) 69 | 70 | # Get split locations 71 | left = int(emphases.SPLIT_SIZE_TRAIN * len(stems)) 72 | right = left + int(emphases.SPLIT_SIZE_VALID * len(stems)) 73 | 74 | # Only train on specified eighth for scaling law experiments 75 | if emphases.ONE_EIGHTH_UTTERANCES: 76 | 77 | # Partition 78 | speakers = [str(s) for s in emphases.data.download.LIBRITTS_SPEAKERS] 79 | train = [stem for stem in stems if stem.split('_')[0] in speakers] 80 | valid = [stem for stem in stems[left:right] if stem not in train] 81 
| test = [stem for stem in stems[right:] if stem not in train] 82 | 83 | else: 84 | 85 | # Partition 86 | train = stems[:left] 87 | valid = stems[left:right] 88 | test = stems[right:] 89 | 90 | # Maybe limit training set size 91 | if emphases.MAX_TRAINING_UTTERANCES is not None: 92 | train = train[:emphases.MAX_TRAINING_UTTERANCES] 93 | 94 | return {'train': train, 'valid': valid, 'test': test} 95 | 96 | 97 | ############################################################################### 98 | # Dataset creation 99 | ############################################################################### 100 | 101 | 102 | def automatic(): 103 | """Partition dataset created from trained model""" 104 | # Get audio files 105 | directory = emphases.CACHE_DIR / 'automatic' 106 | audio_files = directory.rglob('*.wav') 107 | 108 | # Get stems 109 | stems = [file.stem for file in audio_files] 110 | 111 | # Shuffle stems 112 | random.seed(emphases.RANDOM_SEED) 113 | random.shuffle(stems) 114 | 115 | # Get split locations 116 | left = int(emphases.SPLIT_SIZE_TRAIN * len(stems)) 117 | right = left + int(emphases.SPLIT_SIZE_VALID * len(stems)) 118 | 119 | # Partition 120 | return { 121 | 'train': stems[:left], 122 | 'valid': stems[left:right], 123 | 'test': stems[right:]} 124 | 125 | 126 | def crowdsource(): 127 | """Partition crowdsourced dataset""" 128 | # Get audio files 129 | directory = emphases.CACHE_DIR / 'crowdsource' 130 | audio_files = directory.rglob('*.wav') 131 | 132 | # Get stems 133 | stems = [file.stem for file in audio_files] 134 | 135 | # Shuffle stems 136 | random.seed(emphases.RANDOM_SEED) 137 | random.shuffle(stems) 138 | 139 | # Get split locations 140 | left = int(emphases.SPLIT_SIZE_TRAIN * len(stems)) 141 | right = left + int(emphases.SPLIT_SIZE_VALID * len(stems)) 142 | 143 | # Only train on specified eighth for scaling law experiments 144 | if emphases.ONE_EIGHTH_UTTERANCES: 145 | 146 | # Partition 147 | speakers = [str(s) for s in emphases.data.download.LIBRITTS_SPEAKERS] 148 | train = [stem for stem in stems if stem.split('_')[0] in speakers] 149 | valid = [stem for stem in stems[left:right] if stem not in train] 150 | test = [stem for stem in stems[right:] if stem not in train] 151 | 152 | else: 153 | 154 | # Partition 155 | train = stems[:left] 156 | valid = stems[left:right] 157 | test = stems[right:] 158 | 159 | # Maybe limit training set size 160 | if emphases.MAX_TRAINING_UTTERANCES is not None: 161 | train = train[:emphases.MAX_TRAINING_UTTERANCES] 162 | 163 | return {'train': train, 'valid': valid, 'test': test} 164 | -------------------------------------------------------------------------------- /emphases/plot/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | from . 
import scaling 3 | -------------------------------------------------------------------------------- /emphases/plot/core.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | import torch 4 | 5 | 6 | ############################################################################### 7 | # Plot prominence 8 | ############################################################################### 9 | 10 | 11 | def scores(alignment, scores, targets=None): 12 | """Plot the aligned prominence scores""" 13 | figure, axis = plt.subplots(figsize=(30, 5)) 14 | axis.set_axis_off() 15 | axis.set_ylim([0., 1.]) 16 | 17 | # Get words, start times, and durations 18 | centers = [word.start() + word.duration() / 2. for word in alignment] 19 | duration = [word.duration() for word in alignment] 20 | 21 | # Plot scores 22 | axis.bar( 23 | centers, 24 | scores, 25 | duration, 26 | edgecolor='black') 27 | 28 | # Plot words and dividers 29 | for word in alignment: 30 | axis.text( 31 | word.start() + word.duration() / 2, 32 | .015, 33 | str(word), 34 | fontsize=10, 35 | rotation=90, 36 | horizontalalignment='center') 37 | axis.axvline( 38 | word.start(), 39 | color='gray', 40 | linewidth=.5, 41 | ymin=0., 42 | ymax=1., 43 | clip_on=False, 44 | linestyle='--') 45 | axis.axvline( 46 | alignment.duration(), 47 | color='gray', 48 | linewidth=.5, 49 | ymin=0., 50 | ymax=1., 51 | clip_on=False, 52 | linestyle='--') 53 | 54 | if targets is not None: 55 | 56 | # Plot targets 57 | axis.bar(centers, targets, duration) 58 | 59 | # Plot overlap 60 | overlap = torch.minimum(scores, targets) 61 | axis.bar(centers, overlap, duration, color='gray') 62 | 63 | return figure 64 | -------------------------------------------------------------------------------- /emphases/plot/scaling/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * -------------------------------------------------------------------------------- /emphases/plot/scaling/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from pathlib import Path 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Scaling laws plot 9 | ############################################################################### 10 | 11 | 12 | def parse_args(): 13 | """Parse command-line arguments""" 14 | parser = argparse.ArgumentParser( 15 | description='Create scaling law figure') 16 | parser.add_argument( 17 | '--evaluations', 18 | type=str, 19 | nargs='+', 20 | required=True, 21 | help='The evaluations to plot') 22 | parser.add_argument( 23 | '--xlabel', 24 | type=str, 25 | required=True, 26 | help='Label for x axis') 27 | parser.add_argument( 28 | '--output_file', 29 | type=Path, 30 | required=True, 31 | help='The output jpg file') 32 | parser.add_argument( 33 | '--yticks', 34 | type=float, 35 | nargs='+', 36 | required=True, 37 | help='The y axis tick mark locations') 38 | parser.add_argument( 39 | '--sizes', 40 | type=int, 41 | nargs='+', 42 | help='The number of utterances used in each evaluation') 43 | parser.add_argument( 44 | '--scores', 45 | type=float, 46 | nargs='+', 47 | help='The Pearson Correlation y values') 48 | parser.add_argument( 49 | '--steps', 50 | type=int, 51 | nargs='+', 52 | help='The number of training steps') 53 | parser.add_argument( 54 | '--text_offsets', 55 | type=float, 56 | nargs='+', 57 | help='The 
amount to space the text below the plot point') 58 | return parser.parse_args() 59 | 60 | 61 | if __name__ == '__main__': 62 | emphases.plot.scaling.scaling_laws(**vars(parse_args())) 63 | -------------------------------------------------------------------------------- /emphases/plot/scaling/core.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import torch 4 | 5 | import emphases 6 | 7 | 8 | ############################################################################### 9 | # Plot scaling laws 10 | ############################################################################### 11 | 12 | 13 | def scaling_laws( 14 | evaluations, 15 | xlabel, 16 | output_file, 17 | yticks, 18 | scores=None, 19 | steps=None, 20 | sizes=None, 21 | text_offsets=None): 22 | """Plot scaling laws""" 23 | # Load evaluation results 24 | if scores is None or steps is None: 25 | scores, steps = [], [] 26 | for evaluation in evaluations: 27 | path, score = emphases.checkpoint.best_path( 28 | emphases.RUNS_DIR / evaluation) 29 | checkpoint = torch.load(path, map_location='cpu') 30 | scores.append(score) 31 | steps.append(checkpoint['step']) 32 | 33 | # Get x values 34 | x = [int(eval.split('-')[-1]) for eval in evaluations] 35 | 36 | # Create plot 37 | figure, axis = plt.subplots(figsize=(8, 2)) 38 | 39 | # Remove frame 40 | axis.spines['top'].set_visible(False) 41 | axis.spines['right'].set_visible(False) 42 | axis.spines['bottom'].set_visible(False) 43 | axis.spines['left'].set_visible(False) 44 | 45 | # Format x axis 46 | x_range = max(x) - min(x) 47 | axis.set_xlim([0, max(x) + 0.1 * x_range]) 48 | axis.get_xaxis().set_ticks(x) 49 | axis.set_xlabel(xlabel) 50 | axis.xaxis.set_ticks(x) 51 | axis.xaxis.set_ticklabels(x) 52 | 53 | # Format y axis 54 | axis.get_yaxis().set_ticks(yticks) 55 | axis.set_ylim([min(yticks) - .002, max(yticks) + .002]) 56 | axis.tick_params(axis=u'both', which=u'both',length=0) 57 | axis.set_ylabel('Pearson correlation') 58 | 59 | # Grid lines 60 | for tick in yticks: 61 | axis.axhline(tick, color='gray', linestyle='--', linewidth=.8) 62 | 63 | # Plot 64 | colors = ['blue', 'orange', 'purple', 'red'] 65 | for i in range(len(x)): 66 | axis.scatter(x[i], scores[i], color=colors[i]) 67 | 68 | # Default text offset 69 | if text_offsets is None: 70 | text_offsets = [0.011] * len(evaluations) 71 | 72 | # Annotate 73 | for i in range(len(evaluations)): 74 | text = f'steps={steps[i]}' 75 | if sizes is not None: 76 | text += f'\nutterances={sizes[i]}' 77 | axis.text( 78 | x[i], 79 | scores[i] - text_offsets[i], 80 | text, 81 | horizontalalignment='center') 82 | 83 | # Save 84 | figure.savefig(output_file, bbox_inches='tight', pad_inches=0, dpi=300) 85 | -------------------------------------------------------------------------------- /emphases/train/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import * 2 | -------------------------------------------------------------------------------- /emphases/train/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import shutil 3 | from pathlib import Path 4 | 5 | import torchutil 6 | 7 | import emphases 8 | 9 | 10 | ############################################################################### 11 | # Entry point 12 | ############################################################################### 13 | 14 | 15 | def main(config, dataset, 
gpu=None): 16 | # Create output directory 17 | directory = emphases.RUNS_DIR / config.stem 18 | directory.mkdir(parents=True, exist_ok=True) 19 | 20 | # Save configuration 21 | shutil.copyfile(config, directory / config.name) 22 | 23 | # Train 24 | emphases.train(dataset, directory, gpu) 25 | 26 | # Get best checkpoint 27 | checkpoint = torchutil.checkpoint.best_path(directory)[0] 28 | 29 | # Evaluate 30 | emphases.evaluate.datasets(emphases.EVALUATION_DATASETS, checkpoint, gpu) 31 | 32 | 33 | def parse_args(): 34 | """Parse command-line arguments""" 35 | parser = argparse.ArgumentParser(description='Train a model') 36 | parser.add_argument( 37 | '--config', 38 | type=Path, 39 | help='The configuration file') 40 | parser.add_argument( 41 | '--dataset', 42 | default=emphases.TRAINING_DATASET, 43 | help='The dataset to train on') 44 | parser.add_argument( 45 | '--gpu', 46 | type=int, 47 | help='The gpu to run training on') 48 | return parser.parse_args() 49 | 50 | 51 | if __name__ == '__main__': 52 | main(**vars(parse_args())) 53 | -------------------------------------------------------------------------------- /emphases/train/core.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torchutil 3 | 4 | import emphases 5 | 6 | 7 | ############################################################################### 8 | # Training 9 | ############################################################################### 10 | 11 | 12 | @torchutil.notify('train') 13 | def train(dataset, directory, gpu=None): 14 | """Train a model""" 15 | 16 | # Get torch device 17 | device = torch.device('cpu' if gpu is None else f'cuda:{gpu}') 18 | 19 | ####################### 20 | # Create data loaders # 21 | ####################### 22 | 23 | torch.manual_seed(emphases.RANDOM_SEED) 24 | 25 | # Training data 26 | train_loader = emphases.data.loader(dataset, 'train', gpu) 27 | 28 | # Validation data 29 | if emphases.VALIDATION_DATASET == 'buckeye': 30 | 31 | # This is just for generating scaling law plots for the paper 32 | valid_loader = emphases.data.loader('buckeye', 'test', gpu) 33 | 34 | else: 35 | 36 | valid_loader = emphases.data.loader(dataset, 'valid', gpu) 37 | 38 | ################ 39 | # Create model # 40 | ################ 41 | 42 | model = emphases.Model().to(device) 43 | 44 | #################### 45 | # Create optimizer # 46 | #################### 47 | 48 | optimizer = torch.optim.Adam(model.parameters()) 49 | 50 | ############################## 51 | # Maybe load from checkpoint # 52 | ############################## 53 | 54 | path = torchutil.checkpoint.latest_path(directory) 55 | 56 | if path is not None: 57 | 58 | # Load model 59 | model, optimizer, state = torchutil.checkpoint.load( 60 | path, 61 | model, 62 | optimizer) 63 | epoch = state['epoch'] 64 | step = state['step'] 65 | score = state['score'] 66 | best = state['best'] 67 | 68 | else: 69 | 70 | # Train from scratch 71 | epoch, step, score, best = 0, 0, 0., 0. 
72 | 73 | ######### 74 | # Train # 75 | ######### 76 | 77 | # Automatic mixed precision (amp) gradient scaler 78 | scaler = torch.cuda.amp.GradScaler() 79 | 80 | # Setup progress bar 81 | progress = torchutil.iterator( 82 | range(step, emphases.NUM_STEPS), 83 | f'Training {emphases.CONFIG}', 84 | step, 85 | emphases.NUM_STEPS) 86 | while step < emphases.NUM_STEPS: 87 | 88 | # Seed sampler 89 | train_loader.batch_sampler.set_epoch(epoch) 90 | 91 | for batch in train_loader: 92 | 93 | # Unpack batch 94 | ( 95 | features, 96 | frame_lengths, 97 | word_bounds, 98 | word_lengths, 99 | targets, 100 | _, # alignment 101 | _, # audio 102 | _ # stem 103 | ) = batch 104 | 105 | # Copy to GPU 106 | features = features.to(device) 107 | frame_lengths = frame_lengths.to(device) 108 | word_bounds = word_bounds.to(device) 109 | word_lengths = word_lengths.to(device) 110 | targets = targets.to(device) 111 | with torch.autocast(device.type): 112 | 113 | # Forward pass 114 | scores = model( 115 | features, 116 | frame_lengths, 117 | word_bounds, 118 | word_lengths) 119 | 120 | # Compute loss 121 | train_loss = loss( 122 | scores, 123 | targets, 124 | frame_lengths, 125 | word_bounds, 126 | word_lengths, 127 | training=True) 128 | 129 | ################## 130 | # Optimize model # 131 | ################## 132 | 133 | optimizer.zero_grad() 134 | 135 | # Backward pass 136 | scaler.scale(train_loss).backward() 137 | 138 | # Update weights 139 | scaler.step(optimizer) 140 | 141 | # Update gradient scaler 142 | scaler.update() 143 | 144 | ############ 145 | # Evaluate # 146 | ############ 147 | 148 | if step % emphases.LOG_INTERVAL == 0: 149 | score = evaluate( 150 | directory, 151 | step, 152 | model, 153 | gpu, 154 | 'valid', 155 | valid_loader) 156 | 157 | ################### 158 | # Save checkpoint # 159 | ################### 160 | 161 | if step >= 300 and score > best: 162 | torchutil.checkpoint.save( 163 | directory / f'{step:08d}.pt', 164 | model, 165 | optimizer, 166 | epoch=epoch, 167 | step=step, 168 | score=score, 169 | best=best) 170 | best = score 171 | 172 | # End training after a certain number of steps 173 | if step >= emphases.NUM_STEPS: 174 | break 175 | 176 | # Update training step count 177 | step += 1 178 | 179 | # Update progress bar 180 | progress.update() 181 | 182 | # Update epoch count 183 | epoch += 1 184 | 185 | # Close progress bar 186 | progress.close() 187 | 188 | # Save final model 189 | torchutil.checkpoint.save( 190 | directory / f'{step:08d}.pt', 191 | model, 192 | optimizer, 193 | epoch=epoch, 194 | step=step, 195 | score=score, 196 | best=best) 197 | 198 | 199 | ############################################################################### 200 | # Evaluation 201 | ############################################################################### 202 | 203 | 204 | def evaluate(directory, step, model, gpu, condition, loader): 205 | """Perform model evaluation""" 206 | device = 'cpu' if gpu is None else f'cuda:{gpu}' 207 | 208 | # Tensorboard audio and figures 209 | waveforms, figures = {}, {} 210 | 211 | # Prepare model for inference 212 | with emphases.inference_context(model): 213 | 214 | # Cache results to evaluate 215 | results = [] 216 | for i, batch in enumerate(loader): 217 | 218 | # Unpack batch 219 | ( 220 | features, 221 | frame_lengths, 222 | word_bounds, 223 | word_lengths, 224 | targets, 225 | alignments, 226 | audio, 227 | stems 228 | ) = batch 229 | 230 | # Copy to GPU 231 | features = features.to(device) 232 | frame_lengths = frame_lengths.to(device) 233 | 
word_bounds = word_bounds.to(device) 234 | word_lengths = word_lengths.to(device) 235 | targets = targets.to(device) 236 | 237 | # Forward pass 238 | logits = model( 239 | features, 240 | frame_lengths, 241 | word_bounds, 242 | word_lengths) 243 | 244 | # Cache results 245 | results.append(( 246 | logits.detach().cpu(), 247 | targets.detach().cpu(), 248 | word_lengths.detach().cpu())) 249 | 250 | # Add audio and figures 251 | if condition == 'valid' and i < emphases.PLOT_EXAMPLES: 252 | 253 | # Postprocess network output 254 | scores = emphases.postprocess(logits) 255 | 256 | # Add audio 257 | samples = emphases.convert.frames_to_samples(frame_lengths[0]) 258 | waveforms[f'audio/{stems[0]}'] = audio[0, :, :samples] 259 | 260 | # Add figure 261 | figures[stems[0]] = emphases.plot.scores( 262 | alignments[0], 263 | scores[0, 0, :word_lengths[0]].cpu(), 264 | targets[0, 0, :word_lengths[0]].cpu()) 265 | 266 | # Stop when we exceed some number of batches 267 | if i + 1 == emphases.LOG_STEPS: 268 | break 269 | 270 | # Setup batch statistics 271 | target_stats = emphases.evaluate.metrics.Statistics() 272 | predicted_stats = emphases.evaluate.metrics.Statistics() 273 | 274 | # Update statistics 275 | for logits, targets, word_lengths in results: 276 | target_stats.update( 277 | targets.to(device), 278 | word_lengths.to(device)) 279 | predicted_stats.update( 280 | emphases.postprocess(logits.to(device)), 281 | word_lengths.to(device)) 282 | 283 | # Setup evaluation metrics 284 | metrics = emphases.evaluate.Metrics(predicted_stats, target_stats) 285 | 286 | # Update metrics 287 | for logits, targets, word_lengths in results: 288 | metrics.update( 289 | logits.to(device), 290 | targets.to(device), 291 | word_lengths.to(device)) 292 | 293 | # Format results 294 | scalars = { 295 | f'{key}/{condition}': value for key, value in metrics().items()} 296 | 297 | # Write to tensorboard 298 | torchutil.tensorboard.update( 299 | directory, 300 | step, 301 | scalars=scalars, 302 | figures=figures, 303 | audio=waveforms, 304 | sample_rate=emphases.SAMPLE_RATE) 305 | 306 | # Return Pearson correlation 307 | return scalars[f'pearson_correlation/{condition}'] 308 | 309 | 310 | ############################################################################### 311 | # Loss function 312 | ############################################################################### 313 | 314 | 315 | def loss( 316 | scores, 317 | targets, 318 | frame_lengths, 319 | word_bounds, 320 | word_lengths, 321 | training=False, 322 | loss_fn=emphases.LOSS): 323 | """Compute masked loss""" 324 | if training and emphases.DOWNSAMPLE_LOCATION == 'inference': 325 | 326 | # If we are not downsampling the network output before the loss, we 327 | # must upsample the targets 328 | targets = emphases.upsample( 329 | targets, 330 | word_bounds, 331 | word_lengths, 332 | frame_lengths) 333 | 334 | # Linear interpolation can cause out-of-range 335 | if emphases.UPSAMPLE_METHOD == 'linear': 336 | targets = torch.clamp(targets, min=0., max=1.) 
337 | 338 | # Frame resolution sequence mask 339 | mask = emphases.model.mask_from_lengths(frame_lengths) 340 | 341 | else: 342 | 343 | # Word resolution sequence mask 344 | mask = emphases.model.mask_from_lengths(word_lengths) 345 | 346 | # Compute masked loss 347 | if loss_fn == 'bce': 348 | return torch.nn.functional.binary_cross_entropy_with_logits( 349 | scores[mask], 350 | targets[mask]) 351 | elif loss_fn == 'mse': 352 | return torch.nn.functional.mse_loss(scores[mask], targets[mask]) 353 | raise ValueError(f'Loss {loss_fn} is not recognized') 354 | -------------------------------------------------------------------------------- /eval/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/eval/.gitkeep -------------------------------------------------------------------------------- /notebooks/select-speakers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "f473bbab-e880-4f10-be02-f2abf38ca9ad", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "%load_ext autoreload\n", 11 | "%autoreload 2" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "6e996f74-4c77-469a-a333-062febcaa78b", 18 | "metadata": { 19 | "tags": [] 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import random\n", 24 | "\n", 25 | "import IPython.display as ipd\n", 26 | "import torchaudio\n", 27 | "\n", 28 | "import emphases" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "9004ca30-9fa2-436b-ad2c-b778b895e6f6", 35 | "metadata": { 36 | "tags": [] 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "dataset = 'libritts'\n", 41 | "directory = emphases.CACHE_DIR / dataset\n", 42 | "files = list(directory.rglob('*.wav'))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "7c23645c-95c3-45f7-8945-40ca6ff0c064", 49 | "metadata": { 50 | "tags": [] 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "speakers = sorted(list(set(file.stem.split('_')[0] for file in files)))\n", 55 | "speaker_sizes = {speaker: 0. 
for speaker in speakers}\n", 56 | "for file in files:\n", 57 | " info = torchaudio.info(file)\n", 58 | " speaker_sizes[file.stem.split('_')[0]] += info.num_frames / info.sample_rate\n", 59 | "total = sum(speaker_sizes.values())\n", 60 | "total" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "id": "fe70a29b-d5fc-4a32-b08f-21c67776b420", 67 | "metadata": { 68 | "tags": [] 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "current = [\n", 73 | " # Top 5 Female\n", 74 | " 40,\n", 75 | " 669,\n", 76 | " 4362,\n", 77 | " 5022,\n", 78 | " 8123,\n", 79 | " \n", 80 | " # Additional female speakers to get to 1/8th \n", 81 | " 5022,\n", 82 | " 696,\n", 83 | " 6272,\n", 84 | " 5163,\n", 85 | "\n", 86 | " # Top 5 Male\n", 87 | " 196,\n", 88 | " 460,\n", 89 | " 1355,\n", 90 | " 3664,\n", 91 | " 7067, # uses character voices\n", 92 | " \n", 93 | " # Additional male speakers to get to 1/8th \n", 94 | " 405,\n", 95 | " 6437,\n", 96 | " 446, # uses character voices\n", 97 | " 4397\n", 98 | "]\n", 99 | "current_total = sum(speaker_sizes[str(speaker)] for speaker in current) \n", 100 | "current_total" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "9abbbc3f-5d18-48d2-ae57-da36fa322da9", 107 | "metadata": { 108 | "tags": [] 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "current_total / total / (1/8)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "944611a7-7a50-4b86-b86d-e79a83d91f8d", 119 | "metadata": { 120 | "tags": [] 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "speaker = 4397\n", 125 | "files = [file for file in (directory / 'audio').rglob('*.wav') if file.stem.startswith(f'{speaker}_')]\n", 126 | "ipd.display(ipd.Audio(random.choice(files)))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "f1c42bc9-95bc-4ceb-8a88-dc6b60867bb1", 133 | "metadata": { 134 | "tags": [] 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "candidates = sorted(speaker_sizes.items(), key=lambda item: item[1], reverse=True)\n", 139 | "candidates" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "2e7925d3-6c35-459e-af2c-f9af74c21bf7", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "env", 154 | "language": "python", 155 | "name": "env" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.9.16" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 5 172 | } 173 | -------------------------------------------------------------------------------- /results/scaling-annotators.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/results/scaling-annotators.pdf -------------------------------------------------------------------------------- /results/scaling-data.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/results/scaling-data.pdf 
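Note on the two scaling figures above: the run.sh script below passes precomputed --scores and --steps values to emphases.plot.scaling when reproducing them. When the corresponding run directories still exist under runs/, those values can instead be recovered from the saved checkpoints, which is what emphases/plot/scaling/core.py falls back to when --scores and --steps are omitted. The following is a minimal sketch of that recovery, assuming torchutil.checkpoint.best_path returns a (path, score) pair, consistent with how it is indexed in emphases/train/__main__.py; the helper name best_score_and_step is illustrative and not part of the package.

import torch
import torchutil

def best_score_and_step(run_directory):
    """Recover the best validation score and its training step for one run"""
    # Locate the checkpoint with the best validation score saved during training
    # (assumption: best_path returns (path, score), as used elsewhere in this repo)
    path, score = torchutil.checkpoint.best_path(run_directory)
    # The training step is stored alongside the model weights by torchutil.checkpoint.save
    checkpoint = torch.load(path, map_location='cpu')
    return score, checkpoint['step']

For example, best_score_and_step(emphases.RUNS_DIR / '3200') should yield the score and step plotted for the 3200-utterance run, provided that run directory exists.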
-------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | # Runs experiments in the paper 2 | # "Crowdsourced and Automatic Speech Prominence Estimation" 3 | 4 | # Args 5 | # $1 - the GPU index 6 | 7 | SCRIPTDIR="$( dirname -- "$0"; )" 8 | 9 | #################################### 10 | # Annotator redundancy experiments # 11 | #################################### 12 | 13 | 14 | # N.B. - These experiments require Buckeye for evaluation and are therefore 15 | # commented out (see note in README). 16 | 17 | # # 1/64; 8 annotations 18 | # rm -rf data/cache/crowdsource/* 19 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/64-8.py 20 | # python -m emphases.data.preprocess --gpu $1 21 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/64-8.py 22 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/64-8.py --gpu $1 23 | 24 | # # 1/32; 4 annotations 25 | # rm -rf data/cache/crowdsource/* 26 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/32-4.py 27 | # python -m emphases.data.preprocess --gpu $1 28 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/32-4.py 29 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/32-4.py --gpu $1 30 | 31 | # # 1/16; 2 annotations 32 | # rm -rf data/cache/crowdsource/* 33 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/16-2.py 34 | # python -m emphases.data.preprocess --gpu $1 35 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/16-2.py 36 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/16-2.py --gpu $1 37 | 38 | # # 1/8; 1 annotations 39 | # rm -rf data/cache/crowdsource/* 40 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/8-1.py 41 | # python -m emphases.data.preprocess --gpu $1 42 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/8-1.py 43 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/8-1.py --gpu $1 44 | 45 | # # Plot results 46 | # python -m emphases.plot.scaling \ 47 | # --evaluations 8-1 16-2 32-4 64-8 \ 48 | # --xlabel "Annotators per utterance" \ 49 | # --output_file results/scaling-annotators.pdf \ 50 | # --sizes 3200 1600 800 400 \ 51 | # --scores 0.686 0.683 0.667 0.664 \ 52 | # --steps 967 933 567 467 \ 53 | # --yticks 0.66 0.67 0.68 0.69 \ 54 | # --text_offsets 0.007 0.01 0.007 0.007 55 | 56 | 57 | # #################################### 58 | # # Dataset size scaling experiments # 59 | # #################################### 60 | 61 | 62 | # N.B. - These experiments require Buckeye for evaluation and are therefore 63 | # commented out (see note in README). 
64 | 65 | # # 400 utterances 66 | # rm -rf data/cache/crowdsource/* 67 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/400.py 68 | # python -m emphases.data.preprocess --gpu $1 69 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/400.py 70 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/400.py --gpu $1 71 | 72 | # # 800 utterances 73 | # rm -rf data/cache/crowdsource/* 74 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/800.py 75 | # python -m emphases.data.preprocess --gpu $1 76 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/800.py 77 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/800.py --gpu $1 78 | 79 | # # 1600 utterances 80 | # rm -rf data/cache/crowdsource/* 81 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/1600.py 82 | # python -m emphases.data.preprocess --gpu $1 83 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/1600.py 84 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/1600.py --gpu $1 85 | 86 | # # 3200 utterances 87 | # rm -rf data/cache/crowdsource/* 88 | # python -m emphases.data.download --config $SCRIPTDIR/config/scaling/3200.py 89 | # python -m emphases.data.preprocess --gpu $1 90 | # python -m emphases.partition --config $SCRIPTDIR/config/scaling/3200.py 91 | # python -m emphases.train --config $SCRIPTDIR/config/scaling/3200.py --gpu $1 92 | 93 | # # Plot results 94 | # python -m emphases.plot.scaling \ 95 | # --evaluations 400 800 1600 3200 \ 96 | # --xlabel Utterances \ 97 | # --output_file results/scaling-data.pdf \ 98 | # --yticks 0.63 0.65 0.67 0.69 \ 99 | # --scores 0.633 0.657 0.678 0.687 \ 100 | # --steps 400 500 767 1433 \ 101 | # --text_offsets 0.007 0.007 0.007 0.007 102 | 103 | 104 | ############## 105 | # Best model # 106 | ############## 107 | 108 | 109 | python -m emphases.data.download 110 | python -m emphases.data.preprocess --gpu $1 111 | python -m emphases.partition 112 | python -m emphases.train --config $SCRIPTDIR/config/base.py --gpu $1 113 | 114 | 115 | ############# 116 | # Ablations # 117 | ############# 118 | 119 | 120 | python -m emphases.train --config $SCRIPTDIR/config/hparam-search/mse.py --gpu $1 121 | 122 | 123 | ############## 124 | # Downsample # 125 | ############## 126 | 127 | 128 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-inference.py --gpu $1 129 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-intermediate.py --gpu $1 130 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-input.py --gpu $1 131 | python -m emphases.train --config $SCRIPTDIR/config/downsample/average-loss.py --gpu $1 132 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-inference.py --gpu $1 133 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-intermediate.py --gpu $1 134 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-input.py --gpu $1 135 | python -m emphases.train --config $SCRIPTDIR/config/downsample/center-loss.py --gpu $1 136 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-inference.py --gpu $1 137 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-intermediate.py --gpu $1 138 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-input.py --gpu $1 139 | python -m emphases.train --config $SCRIPTDIR/config/downsample/max-loss.py --gpu $1 140 | 141 | 142 | #################################### 
143 | # Large-scale automatic annotation # 144 | #################################### 145 | 146 | 147 | python -m emphases.data.download --datasets automatic --gpu $1 148 | python -m emphases.partition --datasets automatic 149 | python -m emphases.data.preprocess --datasets automatic --gpu $1 150 | python -m emphases.train --config $SCRIPTDIR/config/scaling/base-automatic.py --dataset automatic --gpu $1 151 | 152 | 153 | ############# 154 | # Baselines # 155 | ############# 156 | 157 | 158 | python -m emphases.evaluate --config $SCRIPTDIR/config/baselines/prominence.py 159 | -------------------------------------------------------------------------------- /runs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/interactiveaudiolab/emphases/25de3cbc7896c67d1925149efb427abd0d27f4da/runs/.gitkeep -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | 4 | with open('README.md', encoding='utf8') as file: 5 | long_description = file.read() 6 | 7 | 8 | setup( 9 | name='emphases', 10 | description='Crowdsourced and Automatic Speech Prominence Estimation', 11 | version='0.0.2', 12 | author='Interactive Audio Lab', 13 | author_email='interactiveaudiolab@gmail.com', 14 | url='https://github.com/interactiveaudiolab/emphases', 15 | install_requires=[ 16 | 'GPUtil', 17 | 'huggingface-hub', 18 | 'librosa', 19 | 'matplotlib', 20 | 'numpy', 21 | 'penn', 22 | 'pycwt', 23 | 'pyfoal', 24 | 'pypar', 25 | 'pyyaml', 26 | 'reseval', 27 | 'scipy', 28 | 'torch', 29 | 'torchutil', 30 | 'torchaudio', 31 | 'yapecs'], 32 | packages=find_packages(), 33 | package_data={'emphases': ['assets/*', 'assets/*/*']}, 34 | long_description=long_description, 35 | long_description_content_type='text/markdown', 36 | keywords=['annotation', 'audio', 'emphasis', 'prominence', 'speech'], 37 | classifiers=['License :: OSI Approved :: MIT License'], 38 | license='MIT') 39 | --------------------------------------------------------------------------------
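Note on the masked loss in emphases/train/core.py: it selects valid positions with emphases.model.mask_from_lengths before computing BCE or MSE, and that helper is defined in emphases/model rather than in this listing. The sketch below shows a typical construction of such a length mask; it is illustrative only, not the package's implementation, and it assumes the mask should broadcast against the (batch, 1, length) score tensors seen in evaluate().

import torch

def length_mask(lengths):
    """Illustrative sketch: boolean mask that is True at valid positions"""
    # lengths is a 1D tensor of per-sequence lengths, shape (batch,)
    positions = torch.arange(int(lengths.max()), device=lengths.device)
    # Compare each position index against each sequence length, then add a
    # channel dimension so the mask matches (batch, 1, length) score tensors
    return (positions[None] < lengths[:, None])[:, None]

Indexing scores[mask] and targets[mask] with such a mask flattens both tensors to only the frames or words that fall within each utterance, so padded positions do not contribute to the loss.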