├── .gitignore ├── LICENSE ├── README.md ├── acoustic_forced_alignment ├── .gitignore ├── README.md ├── align_tg_words.py ├── assets │ ├── 2001000001.lab │ └── 2001000001.wav ├── build_dataset.py ├── check_tg.py ├── combine_tg.py ├── dictionaries │ └── opencpop-extension.txt ├── distribution.py ├── enhance_tg.py ├── reformat_wavs.py ├── requirements.txt ├── select_test_set.py ├── slice_tg.py ├── summary_pitch.py ├── validate_labels.py └── validate_lengths.py ├── midi-recognition ├── README.md ├── extract_midi.py └── merge_wavs.py └── variance-temp-solution ├── .gitignore ├── README.md ├── add_ph_num.py ├── add_ph_num_advanced.py ├── assets └── .gitkeep ├── convert_ds.py ├── convert_txt.py ├── correct_cents.py ├── eliminate_short.py ├── estimate_midi.py ├── get_pitch.py ├── requirements.txt └── rmvpe ├── __init__.py ├── constants.py ├── deepunet.py ├── inference.py ├── model.py ├── seq.py ├── spec.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .vscode 3 | *.pyc 4 | __pycache__/ 5 | local_tools/ 6 | /venv/ 7 | 8 | .ipynb_checkpoints/ 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2023, Team OpenVPI 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MakeDiffSinger 2 | Pipelines and tools to build your own DiffSinger dataset. 3 | 4 | For the recommended standard dataset making pipelines, see: 5 | 6 | - acoustic-forced-alignment: make dataset from scratch with MFA for acoustic model training 7 | - variance-temp-solution: temporary solution to extend acoustic datasets into variance datasets 8 | 9 | For other useful pipelines and tools for making a dataset, welcome to raise issues or submit PRs. 
10 | 11 | ## DiffSinger dataset structure 12 | 13 | - dataset1/ 14 | - raw/ 15 | - wavs/ 16 | - recording1.wav 17 | - recording2.wav 18 | - ... 19 | - transcriptions.csv 20 | - dataset2/ 21 | - raw/ 22 | - wavs/ 23 | - ... 24 | - transcriptions.csv 25 | - ... 26 | 27 | ## Essential tools to process and label your datasets 28 | 29 | Dataset tools now have their own repository: [dataset-tools](https://github.com/openvpi/dataset-tools). 30 | 31 | There are mainly 3 components: 32 | 33 | - AudioSlicer: Slice your recordings into short segments 34 | - MinLabel: Label *.lab files containing word transcriptions for acoustic model training. 35 | - SlurCutter: Edit MIDI sequence in *.ds files for variance model training. -------------------------------------------------------------------------------- /acoustic_forced_alignment/.gitignore: -------------------------------------------------------------------------------- 1 | assets/mfa-*/ 2 | assets/*.zip 3 | segments/ 4 | textgrids/ 5 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/README.md: -------------------------------------------------------------------------------- 1 | # Making Datasets from Scratch (Forced Alignment) 2 | 3 | This pipeline will guide you to build your dataset from raw recordings with MFA (Montreal Forced Aligner). 4 | 5 | ## 0. Requirements 6 | 7 | This pipeline will require your dictionary having its corresponding MFA pretrained model. You can see currently supported dictionaries and download their MFA models in the table below: 8 | 9 | | dictionary name | dictionary file | MFA model | 10 | |:------------------:|:----------------------:|:--------------------------------------------------------------------------------------------:| 11 | | Opencpop extension | opencpop-extension.txt | [link](https://huggingface.co/datasets/fox7005/tool/resolve/main/mfa-opencpop-extension.zip) | 12 | 13 | Your recordings must meet the following conditions: 14 | 15 | 1. They must be in one single folder. Files in sub-folders will be ignored. 16 | 2. They must be in WAV format. 17 | 3. They must have a sampling rate higher than 32 kHz. 18 | 4. They should be clean, unaccompanied voices with no significant noise or reverb. 19 | 5. They should contain only voices from one single human. 20 | 21 | **NOTICE:** Before you train a model, you must obtain permission from the copyright holder of the dataset and make sure the provider is fully aware that you will train a model from their data, that you will or will not distribute the synthesized voices and model weights, and the potential risks of this kind of activity. 22 | 23 | ## 1. Clone repo and install dependencies 24 | 25 | ```bash 26 | git clone https://github.com/openvpi/MakeDiffSinger.git 27 | cd MakeDiffSinger/acoustic-forced-alignment 28 | conda create -n mfa python=3.8 --yes # you must use a Conda environment! 29 | conda activate mfa 30 | conda install -c conda-forge montreal-forced-aligner==2.0.6 --yes # install MFA 31 | pip install -r requirements.txt # install other requirements 32 | ``` 33 | 34 | ## 2. Prepare recordings and transcriptions 35 | 36 | ### 2.1 Audio slicing 37 | 38 | The raw data must be sliced into segments of about 5-15 seconds. We recommend using [AudioSlicer](../README.md#essential-tools-to-process-and-label-your-datasets), a simple GUI application that can automatically slice audio files via silence detection. 
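If you prefer to script this step instead of using the GUI, the basic idea of silence-detection slicing can be sketched in a few lines. This is only an illustration (it is not the AudioSlicer implementation), and its output should still be checked with `validate_lengths.py` below:

```python
# A rough silence-detection slicer (illustration only, not the AudioSlicer implementation).
# Non-silent intervals are merged greedily until a segment would exceed max_len seconds.
import pathlib

import librosa
import soundfile


def slice_by_silence(wav_path, out_dir, top_db=40, max_len=15.0):
    y, sr = librosa.load(wav_path, sr=None, mono=True)
    intervals = librosa.effects.split(y, top_db=top_db)  # (start, end) sample indices of non-silence
    out_dir = pathlib.Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    stem = pathlib.Path(wav_path).stem
    seg_start = seg_end = None
    idx = 0
    for start, end in intervals:
        if seg_start is not None and (end - seg_start) / sr > max_len:
            # Flush the accumulated segment before it grows past max_len
            soundfile.write(out_dir / f'{stem}_{idx:03d}.wav', y[seg_start:seg_end], sr)
            idx += 1
            seg_start = None
        if seg_start is None:
            seg_start = start
        seg_end = end
    if seg_start is not None:
        soundfile.write(out_dir / f'{stem}_{idx:03d}.wav', y[seg_start:seg_end], sr)
```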
39 | 40 | Run the following command to validate your segment lengths and count the total length of your sliced segments: 41 | 42 | ```bash 43 | python validate_lengths.py --dir path/to/your/segments/ 44 | ``` 45 | 46 | ### 2.2 Label the segments 47 | 48 | All segments should have their transcriptions (or lyrics) annotated. See [assets/2001000001.wav](assets/2001000001.wav) and its corresponding label [assets/2001000001.lab](assets/2001000001.lab) as an example. 49 | 50 | Each segment should have one annotation file with the same filename as it and `.lab` extension, and placed in the same directory. In the annotation file, you should write all syllables sung or spoken in this segment. Syllables should be split by space, and only syllables that appears in the dictionary are allowed. In addition, all phonemes in the dictionary should be covered in the annotations. Please note that `SP`, `AP` and `` should not be included in the labels although they are in your final phoneme set. 51 | 52 | We developed [MinLabel](../README.md#essential-tools-to-process-and-label-your-datasets), a simple yet efficient tool to help finishing this step. 53 | 54 | Once you finish labeling, run the following command to validate your labels: 55 | 56 | ```bash 57 | python validate_labels.py --dir path/to/your/segments/ --dictionary path/to/your/dictionary.txt 58 | ``` 59 | 60 | This will ensure: 61 | 62 | - All recordings have their corresponding labels. 63 | - There are no unrecognizable phonemes that does not appear in the dictionary. 64 | - All phonemes in the dictionary are covered by the labels. 65 | 66 | If there are failed checks, please fix them and run again. 67 | 68 | A summary of your phoneme coverage will be generated. If there are some phonemes that have extremely few occurrences (for example, less than 20), it is highly recommended to add more recordings to cover these phonemes. 69 | 70 | ## 3. Forced Alignment 71 | 72 | ### 3.1 Reformat recordings 73 | 74 | Given the transcriptions of each segment, we are able to align the phoneme sequence to its corresponding audio, thus obtaining position and duration information of each phoneme. 75 | 76 | We use [Montreal Forced Aligner](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to do forced phoneme alignment. 77 | 78 | MFA fails on some platforms if the WAVs are not in 16kHz 16bit PCM format. The following command will reformat your recordings and copy the labels to another temporary directory. You may delete those temporary files afterwards. 79 | 80 | ```bash 81 | python reformat_wavs.py --src path/to/your/segments/ --dst path/to/tmp/dir/ 82 | ``` 83 | 84 | NOTE: `--normalize` can be added to normalize the audio files with respect to the peak value of the whole segments. This is especially helpful on aspiration detection during TextGrid enhancement if the original segments are too quite. 85 | 86 | ### 3.2 Run MFA on the corpus 87 | 88 | MFA will align your labels to your recordings and save the results to TextGrid files. 
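Once the alignment below has finished, a single result can be spot-checked with the `textgrid` package that is already listed in requirements.txt. A minimal sketch (the filename is just an example):

```python
# Print the word and phone intervals of one aligned TextGrid.
import textgrid

tg = textgrid.TextGrid()
tg.read('path/to/your/textgrids/2001000001.TextGrid')  # example path
words, phones = tg[0], tg[1]  # MFA writes a words tier and a phones tier
for word in words:
    print(f'word  {word.minTime:8.3f} {word.maxTime:8.3f}  {word.mark}')
for phone in phones:
    print(f'phone {phone.minTime:8.3f} {phone.maxTime:8.3f}  {phone.mark}')
```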
89 | 90 | Download the MFA model and run the following command: 91 | 92 | ```bash 93 | mfa align path/to/your/segments/ path/to/your/dictionary.txt path/to/your/model.zip path/to/your/textgrids/ --beam 100 --clean --overwrite 94 | ``` 95 | 96 | Run the following command to check if all TextGrids are successfully generated: 97 | 98 | ```bash 99 | python check_tg.py --wavs path/to/your/segments/ --tg path/to/your/textgrids/ 100 | ``` 101 | 102 | If the checks above fails, or the results are not good, please try another `--beam` value and run the MFA again. TextGrids generated by MFA are still raw and need further processing, so please do not edit them at this time. 103 | 104 | ### 3.3 Enhance and finish the TextGrids 105 | 106 | MFA results might not be good on some long utterances. In this section, we: 107 | 108 | - Try to reduce errors for long utterances 109 | - Detect `AP`s and add `SP`s which have not been labeled before. 110 | 111 | Run: 112 | 113 | ```bash 114 | python enhance_tg.py --wavs path/to/your/segments/ --dictionary path/to/your/dictionary.txt --src path/to/raw/textgrids/ --dst path/to/final/textgrids/ 115 | ``` 116 | 117 | NOTE: There are other useful arguments of this script. If you understand them, you can try to get better results through adjusting those parameters. 118 | 119 | The final TextGrids can be saved for future use. 120 | 121 | If you are interested in the word-level pitch distribution of your dataset, run the following command: 122 | 123 | ```bash 124 | python summary_pitch.py --wavs path/to/your/segments/ --tg path/to/final/textgrids/ 125 | ``` 126 | 127 | ### 3.4 (Optional) Manual TextGrids refinement 128 | 129 | With steps above, the TextGrids we get contains 2 tiers: the words and the phones. Manual refinement to your TextGrids may take lots of effort but will boost the performance and stability of your model. 130 | 131 | This section is a recommended (but not required) way to refine your TextGrids manually. Before you start, an additional dependency to achieve natural sorting needs to be installed: 132 | 133 | ```bash 134 | pip install natsort 135 | ``` 136 | 137 | #### 3.4.1 Combine the recordings and TextGrids 138 | 139 | A full dataset can contain hundreds or thousands of auto-sliced recording segments and their corresponding TextGrids. The following command will combine them into long ones: 140 | 141 | ```bash 142 | python combine_tg.py --wavs path/to/your/segments/ --tg path/to/your/final/textgrids/ --out path/to/your/combined/textgrids/ 143 | ``` 144 | 145 | This will combine all items with same name except their suffixes and add a `sentences` tier in the combined TextGrids. The new sentences tier controls how the long combined recordings are split into short sentences. If you have other suffix pattern (default: `"_\d+"`) or want to change the bit-depth (default: PCM_16) of the combined recordings, see `python combine_tg.py --help`. 146 | 147 | #### 3.4.2 Manual editing 148 | 149 | TextGrids can be viewed and edited with [Praat](https://github.com/praat/praat) or [vLabeler](https://github.com/sdercolin/vlabeler) (recommended). 150 | 151 | The editing mainly involves the sentences tier and the phones tier. When editing, please ensure the sentences tier is aligned with the words and phones tier; but it is not required to align the words tier to the phones tier. If you want to remove a sentence or not to include one area in any sentences, just leave an empty mark on that area. 
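Before moving on to re-slicing, a small script like the one below can catch sentence boundaries that drifted off the phone grid during editing. It is only an informal sanity check and not part of this pipeline; the path is a placeholder:

```python
# Informal check: every non-empty sentence boundary should coincide with some phone boundary.
import pathlib

import textgrid

TOLERANCE = 1e-4  # seconds

for tg_file in pathlib.Path('path/to/your/combined/textgrids/').glob('*.TextGrid'):
    tg = textgrid.TextGrid()
    tg.read(tg_file)
    sentences, phones = tg[0], tg[2]  # tiers: sentences, words, phones
    bounds = [p.minTime for p in phones] + [p.maxTime for p in phones]
    for sentence in sentences:
        if sentence.mark == '':
            continue
        for t in (sentence.minTime, sentence.maxTime):
            if not any(abs(t - b) <= TOLERANCE for b in bounds):
                print(f'{tg_file.name}: boundary {t:.3f} of sentence "{sentence.mark}" is off the phone grid')
```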
152 | 153 | #### 3.4.3 Slice the recordings and TextGrids 154 | 155 | After manual editing is finished, the words tier can be automatically re-aligned to the phones tier. Run: 156 | 157 | ```bash 158 | python align_tg_words.py --tg path/to/your/combined/textgrids --dictionary path/to/your/dictionary.txt --overwrite 159 | ``` 160 | 161 | NOTE 1: This will overwrite your TextGrid files. You can back them up before running the command, or specify another output directory with `--out` option. 162 | 163 | NOTE 2: This script is also compatible with segmented 2-tier TextGrids. 164 | 165 | Then the TextGrids and recordings can be sliced according to the boundaries stored in the sentences tiers. Run: 166 | 167 | ```bash 168 | python slice_tg.py --wavs path/to/your/combined/textgrids/ --out path/to/your/sliced/textgrids/refined/ 169 | ``` 170 | 171 | By default, the output segments will be re-numbered like `item_000`, `item_001`, ..., `item_XXX`. If you want to use the marks stored in the sentences tier as the filenames, or want to change the bit-depth (default: PCM_16) of the sliced recordings, or control other behaviors, see `python slice_tg.py --help`. 172 | 173 | Now you can use these manually refined and re-sliced TextGrids and recordings for further steps. 174 | 175 | ## 4. Build the final dataset 176 | 177 | The TextGrids need to be collected into a transcriptions.csv file as the final transcriptions. The CSV file will include the following columns: 178 | 179 | - name: the segment name 180 | - ph_seq: the phoneme sequence 181 | - ph_dur: the phoneme duration 182 | 183 | The recordings will be arranged like [this](../README.md#diffsinger-dataset-structure). 184 | 185 | Run: 186 | 187 | ```bash 188 | python build_dataset.py --wavs path/to/your/segments/ --tg path/to/final/textgrids/ --dataset path/to/your/dataset/ 189 | ``` 190 | 191 | NOTE 1: This will insert random silence parts around each segments by default for better `SP` stability. If you do not need these silence parts, for example, if your TextGrids have been manually refined, please use the `--skip_silence_insertion` option. 192 | 193 | NOTE 2: `--wav_subtype` can be used to specify the bit-depth of the saved WAV files. Options are `PCM_16` (default), `PCM_24`, `PCM_32`, `FLOAT`, and `DOUBLE`. 194 | 195 | After doing all things above, you should put it into data/ of the DiffSinger main repository. Now, your dataset can be used to train DiffSinger acoustic models. If you want to train DiffSinger variance models, please follow instructions [here](../variance-temp-solution/README.md). 196 | 197 | ## 5. Write configuration file 198 | 199 | Copy the template configration file from `configs/templates` in the DiffSinger repository to your data folder, or a new folder if working with multi-speaker model. Specify required fields in the configurations, check `DiffSinger/docs/ConfigurationSchemas.md` for help on the meanings of those fields. 200 | 201 | For automatic validation set selection, you can leave the following field as empty. If the field is not empty, the script will prompt a overwrite confirmation later. 202 | ```yaml 203 | ... 204 | test_prefixes: 205 | ... 206 | ``` 207 | 208 | And run: 209 | ```bash 210 | python select_test_set.py path/to/your/config.yaml [--rel_path ] 211 | ``` 212 | 213 | NOTE 1: `--rel_path` is probably necessary if there are relative paths in your config file. If only absolute paths exist in it, you can omit this argument. 214 | 215 | NOTE 2: There are other useful arguments of this script. 
You can use them to change the total number of validation samples. 216 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/align_tg_words.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import click 4 | import textgrid 5 | import tqdm 6 | 7 | 8 | @click.command(help='Align words tiers in TextGrids to phones tiers') 9 | @click.option('--tg', required=True, help='Path to TextGrids (2-tier or 3-tier format)') 10 | @click.option('--dictionary', required=True, help='Path to the dictionary file') 11 | @click.option( 12 | '--out', required=False, 13 | help='Path to save the aligned TextGrids (defaults to the input directory)' 14 | ) 15 | @click.option('--overwrite', is_flag=True, help='Overwrite existing files') 16 | def align_tg_words(tg, dictionary, out, overwrite): 17 | tg_path_in = pathlib.Path(tg) 18 | dict_path = pathlib.Path(dictionary) 19 | tg_path_out = pathlib.Path(out) if out is not None else tg_path_in 20 | tg_path_out.mkdir(parents=True, exist_ok=True) 21 | 22 | with open(dict_path, 'r', encoding='utf8') as f: 23 | rules = [ln.strip().split('\t') for ln in f.readlines()] 24 | dictionary = { 25 | 'SP': ['SP'], 26 | 'AP': ['AP'] 27 | } 28 | phoneme_set = {'SP', 'AP'} 29 | for r in rules: 30 | phonemes = r[1].split() 31 | dictionary[r[0]] = phonemes 32 | phoneme_set.update(phonemes) 33 | 34 | for tgfile in tqdm.tqdm(tg_path_in.glob('*.TextGrid')): 35 | tg = textgrid.TextGrid() 36 | tg.read(tgfile) 37 | old_words_tier: textgrid.IntervalTier = tg[-2] 38 | if old_words_tier.name != 'words': 39 | raise ValueError( 40 | f'Invalid tier name or order in \'{tgfile}\'. ' 41 | f'The words tier should be the 1st tier of a 2-tier TextGrid, ' 42 | f'or the 2nd tier of a 3-tier TextGrid.' 43 | ) 44 | phones_tier: textgrid.IntervalTier = tg[-1] 45 | new_words_tier = textgrid.IntervalTier(name='words') 46 | word_seq = [i.mark for i in old_words_tier] 47 | word_div = [] 48 | ph_seq = [i.mark for i in phones_tier] 49 | ph_dur = [i.duration() for i in phones_tier] 50 | idx = 0 51 | for i, word in enumerate(word_seq): 52 | if word not in dictionary: 53 | raise ValueError(f'Error invalid word in \'{tgfile}\' at {i}: {word}') 54 | word_ph_seq = dictionary[word] 55 | ph_num = len(word_ph_seq) 56 | word_div.append(ph_num) 57 | if word_ph_seq != ph_seq[idx: idx + ph_num]: 58 | print( 59 | f'Warning: word and phones mismatch in \'{tgfile}\' ' 60 | f'at word {i}, phone {idx}: {word} => {ph_seq[idx: idx + len(word_ph_seq)]}' 61 | ) 62 | idx += ph_num 63 | for i, phone in enumerate(ph_seq): 64 | if phone not in phoneme_set: 65 | raise ValueError(f'Error: invalid phone in \'{tgfile}\' at {i}: {phone}') 66 | if sum(word_div) != len(ph_dur): 67 | raise ValueError( 68 | f'Error: word_div does not sum to number of phones in \'{tgfile}\'. ' 69 | f'Check the warnings above for more detailed mismatching positions.' 70 | ) 71 | start = 0. 
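        # Rebuild the words tier: give each word an interval spanning the summed durations of its phones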
72 | idx = 0 73 | for j in range(len(word_seq)): 74 | end = start + sum(ph_dur[idx: idx + word_div[j]]) 75 | new_words_tier.add(minTime=start, maxTime=end, mark=word_seq[j]) 76 | start = end 77 | idx += word_div[j] 78 | tg.tiers[-2] = new_words_tier 79 | tg_file_out = tg_path_out / tgfile.name 80 | if tg_file_out.exists() and not overwrite: 81 | raise FileExistsError(str(tg_file_out)) 82 | tg.write(tg_file_out) 83 | 84 | 85 | if __name__ == '__main__': 86 | align_tg_words() 87 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/assets/2001000001.lab: -------------------------------------------------------------------------------- 1 | gan shou ting zai wo fa duan de zhi jian -------------------------------------------------------------------------------- /acoustic_forced_alignment/assets/2001000001.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvpi/MakeDiffSinger/ca134d36dc8eec06002a72cd0a59257abcf7bb84/acoustic_forced_alignment/assets/2001000001.wav -------------------------------------------------------------------------------- /acoustic_forced_alignment/build_dataset.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pathlib 3 | import random 4 | 5 | import click 6 | import librosa 7 | import numpy as np 8 | import soundfile 9 | import tqdm 10 | from textgrid import TextGrid 11 | 12 | 13 | @click.command(help='Collect phoneme alignments into transcriptions.csv') 14 | @click.option('--wavs', required=True, help='Path to the segments directory') 15 | @click.option('--tg', required=True, help='Path to the final TextGrids directory') 16 | @click.option('--dataset', required=True, help='Path to dataset directory') 17 | @click.option('--skip_silence_insertion', is_flag=True, show_default=True, 18 | help='Do not insert silence around segments') 19 | @click.option('--wav_subtype', default="PCM_16", show_default=True, 20 | help='WAV subtype') 21 | def build_dataset(wavs, tg, dataset, skip_silence_insertion, wav_subtype): 22 | wavs = pathlib.Path(wavs) 23 | tg_dir = pathlib.Path(tg) 24 | del tg 25 | dataset = pathlib.Path(dataset) 26 | filelist = list(wavs.glob('*.wav')) 27 | 28 | dataset.mkdir(parents=True, exist_ok=True) 29 | (dataset / 'wavs').mkdir(exist_ok=True) 30 | transcriptions = [] 31 | samplerate = 44100 32 | min_sil = int(0.1 * samplerate) 33 | max_sil = int(0.5 * samplerate) 34 | for wavfile in tqdm.tqdm(filelist): 35 | y, _ = librosa.load(wavfile, sr=samplerate, mono=True) 36 | tgfile = tg_dir / wavfile.with_suffix('.TextGrid').name 37 | tg = TextGrid() 38 | tg.read(str(tgfile)) 39 | ph_seq = [ph.mark for ph in tg[1]] 40 | ph_dur = [ph.maxTime - ph.minTime for ph in tg[1]] 41 | if not skip_silence_insertion: 42 | if random.random() < 0.5: 43 | len_sil = random.randrange(min_sil, max_sil) 44 | y = np.concatenate((np.zeros((len_sil,), dtype=np.float32), y)) 45 | if ph_seq[0] == 'SP': 46 | ph_dur[0] += len_sil / samplerate 47 | else: 48 | ph_seq.insert(0, 'SP') 49 | ph_dur.insert(0, len_sil / samplerate) 50 | if random.random() < 0.5: 51 | len_sil = random.randrange(min_sil, max_sil) 52 | y = np.concatenate((y, np.zeros((len_sil,), dtype=np.float32))) 53 | if ph_seq[-1] == 'SP': 54 | ph_dur[-1] += len_sil / samplerate 55 | else: 56 | ph_seq.append('SP') 57 | ph_dur.append(len_sil / samplerate) 58 | ph_seq = ' '.join(ph_seq) 59 | ph_dur = ' '.join([str(round(d, 6)) for d in ph_dur]) 60 | 
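        # Save the (possibly silence-padded) audio and record its transcription entry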
soundfile.write(dataset / 'wavs' / wavfile.name, y, samplerate, subtype=wav_subtype) 61 | transcriptions.append({'name': wavfile.stem, 'ph_seq': ph_seq, 'ph_dur': ph_dur}) 62 | 63 | with open(dataset / 'transcriptions.csv', 'w', encoding='utf8', newline='') as f: 64 | writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur']) 65 | writer.writeheader() 66 | writer.writerows(transcriptions) 67 | 68 | print(f'All wavs and transcriptions saved in {dataset}') 69 | 70 | 71 | if __name__ == '__main__': 72 | build_dataset() 73 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/check_tg.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import click 4 | import tqdm 5 | 6 | 7 | @click.command('Check if all TextGrids are generated') 8 | @click.option('--wavs', required=True, help='Path to the segments directory') 9 | @click.option('--tg', required=True, help='Path to the TextGrids directory') 10 | def check_tg(wavs, tg): 11 | wavs = pathlib.Path(wavs) 12 | tg = pathlib.Path(tg) 13 | missing = [] 14 | filelist = list(wavs.glob('*.wav')) 15 | for wavfile in tqdm.tqdm(filelist): 16 | tgfile = tg / wavfile.with_suffix('.TextGrid').name 17 | if not tgfile.exists(): 18 | missing.append(tgfile) 19 | if len(missing) > 0: 20 | print( 21 | 'These TextGrids are missing! There are possible severe errors in labels of those corresponding segments. ' 22 | 'If you do believe there are no errors, consider increase the \'--beam\' argument for MFA.') 23 | for fn in missing: 24 | print(f' - {fn}') 25 | else: 26 | print('All alignments have been successfully generated.') 27 | 28 | 29 | if __name__ == '__main__': 30 | check_tg() 31 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/combine_tg.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import re 3 | from typing import Dict, List 4 | 5 | import click 6 | import librosa 7 | import natsort 8 | import numpy 9 | import soundfile 10 | import textgrid 11 | import tqdm 12 | 13 | 14 | def remove_suffix(string, suffix_pattern): 15 | match = re.search(f'{suffix_pattern}$', string) 16 | if not match: 17 | return string 18 | return string[:-len(match.group())] 19 | 20 | 21 | @click.command(help='Combine segmented 2-tier TextGrids and wavs into 3-tier TextGrids and long wavs') 22 | @click.option( 23 | '--wavs', required=True, 24 | help='Directory containing the segmented wav files' 25 | ) 26 | @click.option( 27 | '--tg', required=False, 28 | help='Directory containing the segmented TextGrid files (defaults to wav directory)' 29 | ) 30 | @click.option( 31 | '--out', required=True, 32 | help='Path to output directory for combined files' 33 | ) 34 | @click.option( 35 | '--suffix', required=False, default=r'_\d+', 36 | help='Filename suffix pattern for file combination' 37 | ) 38 | @click.option( 39 | '--wav_subtype', required=False, default='PCM_16', 40 | help='Wav subtype (defaults to PCM_16)' 41 | ) 42 | @click.option( 43 | '--overwrite', is_flag=True, 44 | help='Overwrite existing files' 45 | ) 46 | def combine_tg(wavs, tg, out, suffix, wav_subtype, overwrite): 47 | wav_path_in = pathlib.Path(wavs) 48 | tg_path_in = wav_path_in if tg is None else pathlib.Path(tg) 49 | del tg 50 | combined_path_out = pathlib.Path(out) 51 | combined_path_out.mkdir(parents=True, exist_ok=True) 52 | filelist: Dict[str, List[pathlib.Path]] = {} 53 | for 
tg_file in tg_path_in.glob('*.TextGrid'): 54 | stem = remove_suffix(tg_file.stem, suffix) 55 | if stem not in filelist: 56 | filelist[stem] = [tg_file] 57 | else: 58 | filelist[stem].append(tg_file) 59 | for name, files in tqdm.tqdm(sorted(filelist.items(), key=lambda kv: kv[0])): 60 | wav_segments = [] 61 | tg = textgrid.TextGrid() 62 | sentences_tier = textgrid.IntervalTier(name='sentences') 63 | words_tier = textgrid.IntervalTier(name='words') 64 | phones_tier = textgrid.IntervalTier(name='phones') 65 | sentence_start = 0. 66 | sr = None 67 | for tg_file in natsort.natsorted(files): 68 | wav_file = (wav_path_in / tg_file.name).with_suffix('.wav') 69 | waveform, sr_ = librosa.load(wav_file, sr=None) 70 | if sr is None: 71 | sr = sr_ 72 | else: 73 | assert sr_ == sr, f'Cannot combine \'{tg_file.stem}\': incompatible samplerate ({sr_} != {sr})' 74 | sentence_end = waveform.shape[0] / sr + sentence_start 75 | wav_segments.append(waveform) 76 | sentences_tier.add(minTime=sentence_start, maxTime=sentence_end, mark=wav_file.stem) 77 | sentence_tg = textgrid.TextGrid() 78 | sentence_tg.read(tg_file) 79 | start = sentence_start 80 | for j, word in enumerate(sentence_tg[0]): 81 | if j == len(sentence_tg[0]) - 1: 82 | end = sentence_end 83 | else: 84 | end = start + word.duration() 85 | words_tier.add(minTime=start, maxTime=end, mark=word.mark) 86 | start = end 87 | start = sentence_start 88 | for j, phone in enumerate(sentence_tg[1]): 89 | if j == len(sentence_tg[1]) - 1: 90 | end = sentence_end 91 | else: 92 | end = start + phone.duration() 93 | phones_tier.add(minTime=start, maxTime=end, mark=phone.mark) 94 | start = end 95 | sentence_start = sentence_end 96 | tg.append(sentences_tier) 97 | tg.append(words_tier) 98 | tg.append(phones_tier) 99 | 100 | tg_file_out = combined_path_out / f'{name}.TextGrid' 101 | wav_file_out = tg_file_out.with_suffix('.wav') 102 | if wav_file_out.exists() and not overwrite: 103 | raise FileExistsError(str(wav_file_out)) 104 | if tg_file_out.exists() and not overwrite: 105 | raise FileExistsError(str(tg_file_out)) 106 | 107 | tg.write(tg_file_out) 108 | full_wav = numpy.concatenate(wav_segments) 109 | soundfile.write(wav_file_out, full_wav, samplerate=sr, subtype=wav_subtype) 110 | 111 | 112 | if __name__ == '__main__': 113 | combine_tg() 114 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/dictionaries/opencpop-extension.txt: -------------------------------------------------------------------------------- 1 | a a 2 | ai ai 3 | an an 4 | ang ang 5 | ao ao 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b ang 10 | bao b ao 11 | be b e 12 | bei b ei 13 | ben b en 14 | beng b eng 15 | ber b er 16 | bi b i 17 | bia b ia 18 | bian b ian 19 | biang b iang 20 | biao b iao 21 | bie b ie 22 | bin b in 23 | bing b ing 24 | biong b iong 25 | biu b iu 26 | bo b o 27 | bong b ong 28 | bou b ou 29 | bu b u 30 | bua b ua 31 | buai b uai 32 | buan b uan 33 | buang b uang 34 | bui b ui 35 | bun b un 36 | bv b v 37 | bve b ve 38 | ca c a 39 | cai c ai 40 | can c an 41 | cang c ang 42 | cao c ao 43 | ce c e 44 | cei c ei 45 | cen c en 46 | ceng c eng 47 | cer c er 48 | cha ch a 49 | chai ch ai 50 | chan ch an 51 | chang ch ang 52 | chao ch ao 53 | che ch e 54 | chei ch ei 55 | chen ch en 56 | cheng ch eng 57 | cher ch er 58 | chi ch ir 59 | chong ch ong 60 | chou ch ou 61 | chu ch u 62 | chua ch ua 63 | chuai ch uai 64 | chuan ch uan 65 | chuang ch uang 66 | chui ch ui 67 | chun ch un 68 | chuo ch uo 69 | chv ch v 70 | chyi ch 
i 71 | ci c i0 72 | cong c ong 73 | cou c ou 74 | cu c u 75 | cua c ua 76 | cuai c uai 77 | cuan c uan 78 | cuang c uang 79 | cui c ui 80 | cun c un 81 | cuo c uo 82 | cv c v 83 | cyi c i 84 | da d a 85 | dai d ai 86 | dan d an 87 | dang d ang 88 | dao d ao 89 | de d e 90 | dei d ei 91 | den d en 92 | deng d eng 93 | der d er 94 | di d i 95 | dia d ia 96 | dian d ian 97 | diang d iang 98 | diao d iao 99 | die d ie 100 | din d in 101 | ding d ing 102 | diong d iong 103 | diu d iu 104 | dong d ong 105 | dou d ou 106 | du d u 107 | dua d ua 108 | duai d uai 109 | duan d uan 110 | duang d uang 111 | dui d ui 112 | dun d un 113 | duo d uo 114 | dv d v 115 | dve d ve 116 | e e 117 | ei ei 118 | en en 119 | eng eng 120 | er er 121 | fa f a 122 | fai f ai 123 | fan f an 124 | fang f ang 125 | fao f ao 126 | fe f e 127 | fei f ei 128 | fen f en 129 | feng f eng 130 | fer f er 131 | fi f i 132 | fia f ia 133 | fian f ian 134 | fiang f iang 135 | fiao f iao 136 | fie f ie 137 | fin f in 138 | fing f ing 139 | fiong f iong 140 | fiu f iu 141 | fo f o 142 | fong f ong 143 | fou f ou 144 | fu f u 145 | fua f ua 146 | fuai f uai 147 | fuan f uan 148 | fuang f uang 149 | fui f ui 150 | fun f un 151 | fv f v 152 | fve f ve 153 | ga g a 154 | gai g ai 155 | gan g an 156 | gang g ang 157 | gao g ao 158 | ge g e 159 | gei g ei 160 | gen g en 161 | geng g eng 162 | ger g er 163 | gi g i 164 | gia g ia 165 | gian g ian 166 | giang g iang 167 | giao g iao 168 | gie g ie 169 | gin g in 170 | ging g ing 171 | giong g iong 172 | giu g iu 173 | gong g ong 174 | gou g ou 175 | gu g u 176 | gua g ua 177 | guai g uai 178 | guan g uan 179 | guang g uang 180 | gui g ui 181 | gun g un 182 | guo g uo 183 | gv g v 184 | gve g ve 185 | ha h a 186 | hai h ai 187 | han h an 188 | hang h ang 189 | hao h ao 190 | he h e 191 | hei h ei 192 | hen h en 193 | heng h eng 194 | her h er 195 | hi h i 196 | hia h ia 197 | hian h ian 198 | hiang h iang 199 | hiao h iao 200 | hie h ie 201 | hin h in 202 | hing h ing 203 | hiong h iong 204 | hiu h iu 205 | hong h ong 206 | hou h ou 207 | hu h u 208 | hua h ua 209 | huai h uai 210 | huan h uan 211 | huang h uang 212 | hui h ui 213 | hun h un 214 | huo h uo 215 | hv h v 216 | hve h ve 217 | ji j i 218 | jia j ia 219 | jian j ian 220 | jiang j iang 221 | jiao j iao 222 | jie j ie 223 | jin j in 224 | jing j ing 225 | jiong j iong 226 | jiu j iu 227 | ju j v 228 | juan j van 229 | jue j ve 230 | jun j vn 231 | ka k a 232 | kai k ai 233 | kan k an 234 | kang k ang 235 | kao k ao 236 | ke k e 237 | kei k ei 238 | ken k en 239 | keng k eng 240 | ker k er 241 | ki k i 242 | kia k ia 243 | kian k ian 244 | kiang k iang 245 | kiao k iao 246 | kie k ie 247 | kin k in 248 | king k ing 249 | kiong k iong 250 | kiu k iu 251 | kong k ong 252 | kou k ou 253 | ku k u 254 | kua k ua 255 | kuai k uai 256 | kuan k uan 257 | kuang k uang 258 | kui k ui 259 | kun k un 260 | kuo k uo 261 | kv k v 262 | kve k ve 263 | la l a 264 | lai l ai 265 | lan l an 266 | lang l ang 267 | lao l ao 268 | le l e 269 | lei l ei 270 | len l en 271 | leng l eng 272 | ler l er 273 | li l i 274 | lia l ia 275 | lian l ian 276 | liang l iang 277 | liao l iao 278 | lie l ie 279 | lin l in 280 | ling l ing 281 | liong l iong 282 | liu l iu 283 | lo l o 284 | long l ong 285 | lou l ou 286 | lu l u 287 | lua l ua 288 | luai l uai 289 | luan l uan 290 | luang l uang 291 | lui l ui 292 | lun l un 293 | luo l uo 294 | lv l v 295 | lve l ve 296 | ma m a 297 | mai m ai 298 | man m an 299 | mang m ang 300 | mao m ao 301 | me m e 302 | mei m 
ei 303 | men m en 304 | meng m eng 305 | mer m er 306 | mi m i 307 | mia m ia 308 | mian m ian 309 | miang m iang 310 | miao m iao 311 | mie m ie 312 | min m in 313 | ming m ing 314 | miong m iong 315 | miu m iu 316 | mo m o 317 | mong m ong 318 | mou m ou 319 | mu m u 320 | mua m ua 321 | muai m uai 322 | muan m uan 323 | muang m uang 324 | mui m ui 325 | mun m un 326 | mv m v 327 | mve m ve 328 | na n a 329 | nai n ai 330 | nan n an 331 | nang n ang 332 | nao n ao 333 | ne n e 334 | nei n ei 335 | nen n en 336 | neng n eng 337 | ner n er 338 | ni n i 339 | nia n ia 340 | nian n ian 341 | niang n iang 342 | niao n iao 343 | nie n ie 344 | nin n in 345 | ning n ing 346 | niong n iong 347 | niu n iu 348 | nong n ong 349 | nou n ou 350 | nu n u 351 | nua n ua 352 | nuai n uai 353 | nuan n uan 354 | nuang n uang 355 | nui n ui 356 | nun n un 357 | nuo n uo 358 | nv n v 359 | nve n ve 360 | o o 361 | ong ong 362 | ou ou 363 | pa p a 364 | pai p ai 365 | pan p an 366 | pang p ang 367 | pao p ao 368 | pe p e 369 | pei p ei 370 | pen p en 371 | peng p eng 372 | per p er 373 | pi p i 374 | pia p ia 375 | pian p ian 376 | piang p iang 377 | piao p iao 378 | pie p ie 379 | pin p in 380 | ping p ing 381 | piong p iong 382 | piu p iu 383 | po p o 384 | pong p ong 385 | pou p ou 386 | pu p u 387 | pua p ua 388 | puai p uai 389 | puan p uan 390 | puang p uang 391 | pui p ui 392 | pun p un 393 | pv p v 394 | pve p ve 395 | qi q i 396 | qia q ia 397 | qian q ian 398 | qiang q iang 399 | qiao q iao 400 | qie q ie 401 | qin q in 402 | qing q ing 403 | qiong q iong 404 | qiu q iu 405 | qu q v 406 | quan q van 407 | que q ve 408 | qun q vn 409 | ra r a 410 | rai r ai 411 | ran r an 412 | rang r ang 413 | rao r ao 414 | re r e 415 | rei r ei 416 | ren r en 417 | reng r eng 418 | rer r er 419 | ri r ir 420 | rong r ong 421 | rou r ou 422 | ru r u 423 | rua r ua 424 | ruai r uai 425 | ruan r uan 426 | ruang r uang 427 | rui r ui 428 | run r un 429 | ruo r uo 430 | rv r v 431 | ryi r i 432 | sa s a 433 | sai s ai 434 | san s an 435 | sang s ang 436 | sao s ao 437 | se s e 438 | sei s ei 439 | sen s en 440 | seng s eng 441 | ser s er 442 | sha sh a 443 | shai sh ai 444 | shan sh an 445 | shang sh ang 446 | shao sh ao 447 | she sh e 448 | shei sh ei 449 | shen sh en 450 | sheng sh eng 451 | sher sh er 452 | shi sh ir 453 | shong sh ong 454 | shou sh ou 455 | shu sh u 456 | shua sh ua 457 | shuai sh uai 458 | shuan sh uan 459 | shuang sh uang 460 | shui sh ui 461 | shun sh un 462 | shuo sh uo 463 | shv sh v 464 | shyi sh i 465 | si s i0 466 | song s ong 467 | sou s ou 468 | su s u 469 | sua s ua 470 | suai s uai 471 | suan s uan 472 | suang s uang 473 | sui s ui 474 | sun s un 475 | suo s uo 476 | sv s v 477 | syi s i 478 | ta t a 479 | tai t ai 480 | tan t an 481 | tang t ang 482 | tao t ao 483 | te t e 484 | tei t ei 485 | ten t en 486 | teng t eng 487 | ter t er 488 | ti t i 489 | tia t ia 490 | tian t ian 491 | tiang t iang 492 | tiao t iao 493 | tie t ie 494 | tin t in 495 | ting t ing 496 | tiong t iong 497 | tong t ong 498 | tou t ou 499 | tu t u 500 | tua t ua 501 | tuai t uai 502 | tuan t uan 503 | tuang t uang 504 | tui t ui 505 | tun t un 506 | tuo t uo 507 | tv t v 508 | tve t ve 509 | wa w a 510 | wai w ai 511 | wan w an 512 | wang w ang 513 | wao w ao 514 | we w e 515 | wei w ei 516 | wen w en 517 | weng w eng 518 | wer w er 519 | wi w i 520 | wo w o 521 | wong w ong 522 | wou w ou 523 | wu w u 524 | xi x i 525 | xia x ia 526 | xian x ian 527 | xiang x iang 528 | xiao x iao 529 | xie x ie 530 | xin x 
in 531 | xing x ing 532 | xiong x iong 533 | xiu x iu 534 | xu x v 535 | xuan x van 536 | xue x ve 537 | xun x vn 538 | ya y a 539 | yai y ai 540 | yan y En 541 | yang y ang 542 | yao y ao 543 | ye y E 544 | yei y ei 545 | yi y i 546 | yin y in 547 | ying y ing 548 | yo y o 549 | yong y ong 550 | you y ou 551 | yu y v 552 | yuan y van 553 | yue y ve 554 | yun y vn 555 | ywu y u 556 | za z a 557 | zai z ai 558 | zan z an 559 | zang z ang 560 | zao z ao 561 | ze z e 562 | zei z ei 563 | zen z en 564 | zeng z eng 565 | zer z er 566 | zha zh a 567 | zhai zh ai 568 | zhan zh an 569 | zhang zh ang 570 | zhao zh ao 571 | zhe zh e 572 | zhei zh ei 573 | zhen zh en 574 | zheng zh eng 575 | zher zh er 576 | zhi zh ir 577 | zhong zh ong 578 | zhou zh ou 579 | zhu zh u 580 | zhua zh ua 581 | zhuai zh uai 582 | zhuan zh uan 583 | zhuang zh uang 584 | zhui zh ui 585 | zhun zh un 586 | zhuo zh uo 587 | zhv zh v 588 | zhyi zh i 589 | zi z i0 590 | zong z ong 591 | zou z ou 592 | zu z u 593 | zua z ua 594 | zuai z uai 595 | zuan z uan 596 | zuang z uang 597 | zui z ui 598 | zun z un 599 | zuo z uo 600 | zv z v 601 | zyi z i 602 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/distribution.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def draw_distribution(title, x_label, y_label, items: list, values: list, zoom=0.8): 5 | plt.figure(figsize=(int(len(items) * zoom), 10)) 6 | plt.bar(x=items, height=values) 7 | plt.tick_params(labelsize=15) 8 | plt.xlim(-1, len(items)) 9 | for a, b in zip(items, values): 10 | plt.text(a, b, b, ha='center', va='bottom', fontsize=15) 11 | plt.grid() 12 | plt.title(title, fontsize=30) 13 | plt.xlabel(x_label, fontsize=20) 14 | plt.ylabel(y_label, fontsize=20) 15 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/enhance_tg.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import click 4 | import librosa 5 | import numpy as np 6 | import parselmouth as pm 7 | import textgrid as tg 8 | import tqdm 9 | 10 | 11 | @click.command(help='Enhance and finish the TextGrids') 12 | @click.option('--wavs', required=True, help='Path to the segments directory') 13 | @click.option('--dictionary', required=True, help='Path to the dictionary file') 14 | @click.option('--src', required=True, help='Path to the raw TextGrids directory') 15 | @click.option('--dst', required=True, help='Path to the final TextGrids directory') 16 | @click.option('--f0_min', type=float, default=40., show_default=True, help='Minimum value of pitch') 17 | @click.option('--f0_max', type=float, default=1100., show_default=True, help='Maximum value of pitch') 18 | @click.option('--br_len', type=float, default=0.1, show_default=True, 19 | help='Minimum length of breath in seconds') 20 | @click.option('--br_db', type=float, default=-60., show_default=True, 21 | help='Threshold of RMS in dB for detecting breath') 22 | @click.option('--br_centroid', type=float, default=2000., show_default=True, 23 | help='Threshold of spectral centroid in Hz for detecting breath') 24 | @click.option('--time_step', type=float, default=0.005, show_default=True, 25 | help='Time step for feature extraction') 26 | @click.option('--min_space', type=float, default=0.04, show_default=True, 27 | help='Minimum length of space in seconds') 28 | @click.option('--voicing_thresh_vowel', 
type=float, default=0.45, show_default=True, 29 | help='Threshold of voicing for fixing long utterances') 30 | @click.option('--voicing_thresh_breath', type=float, default=0.6, show_default=True, 31 | help='Threshold of voicing for detecting breath') 32 | @click.option('--br_win_sz', type=float, default=0.05, show_default=True, 33 | help='Size of sliding window in seconds for detecting breath') 34 | def enhance_tg( 35 | wavs, dictionary, src, dst, 36 | f0_min, f0_max, br_len, br_db, br_centroid, 37 | time_step, min_space, voicing_thresh_vowel, voicing_thresh_breath, br_win_sz 38 | ): 39 | wavs = pathlib.Path(wavs) 40 | dict_path = pathlib.Path(dictionary) 41 | src = pathlib.Path(src) 42 | dst = pathlib.Path(dst) 43 | dst.mkdir(parents=True, exist_ok=True) 44 | 45 | with open(dict_path, 'r', encoding='utf8') as f: 46 | rules = [ln.strip().split('\t') for ln in f.readlines()] 47 | dictionary = {} 48 | phoneme_set = set() 49 | for r in rules: 50 | phonemes = r[1].split() 51 | dictionary[r[0]] = phonemes 52 | phoneme_set.update(phonemes) 53 | 54 | filelist = list(wavs.glob('*.wav')) 55 | for wavfile in tqdm.tqdm(filelist): 56 | tgfile = src / wavfile.with_suffix('.TextGrid').name 57 | textgrid = tg.TextGrid() 58 | textgrid.read(str(tgfile)) 59 | words = textgrid[0] 60 | phones = textgrid[1] 61 | sound = pm.Sound(str(wavfile)) 62 | f0_voicing_breath = sound.to_pitch_ac( 63 | time_step=time_step, 64 | voicing_threshold=voicing_thresh_breath, 65 | pitch_floor=f0_min, 66 | pitch_ceiling=f0_max, 67 | ).selected_array['frequency'] 68 | f0_voicing_vowel = sound.to_pitch_ac( 69 | time_step=time_step, 70 | voicing_threshold=voicing_thresh_vowel, 71 | pitch_floor=f0_min, 72 | pitch_ceiling=f0_max, 73 | ).selected_array['frequency'] 74 | y, sr = librosa.load(wavfile, sr=24000, mono=True) 75 | hop_size = int(time_step * sr) 76 | spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=2048, hop_length=hop_size).squeeze(0) 77 | 78 | # Fix long utterances 79 | i = j = 0 80 | while i < len(words): 81 | word = words[i] 82 | phone = phones[j] 83 | if word.mark is not None and word.mark != '': 84 | i += 1 85 | j += len(dictionary[word.mark]) 86 | continue 87 | if i == 0: 88 | i += 1 89 | j += 1 90 | continue 91 | prev_word = words[i - 1] 92 | prev_phone = phones[j - 1] 93 | # Extend length of long utterances 94 | while word.minTime < word.maxTime - time_step: 95 | pos = min(f0_voicing_vowel.shape[0] - 1, int(word.minTime / time_step)) 96 | if f0_voicing_vowel[pos] < f0_min: 97 | break 98 | prev_word.maxTime += time_step 99 | prev_phone.maxTime += time_step 100 | word.minTime += time_step 101 | phone.minTime += time_step 102 | i += 1 103 | j += 1 104 | 105 | # Detect aspiration 106 | i = j = 0 107 | while i < len(words): 108 | word = words[i] 109 | phone = phones[j] 110 | if word.mark is not None and word.mark != '': 111 | i += 1 112 | j += len(dictionary[word.mark]) 113 | continue 114 | if word.maxTime - word.minTime < br_len: 115 | i += 1 116 | j += 1 117 | continue 118 | ap_ranges = [] 119 | br_start = None 120 | win_pos = word.minTime 121 | while win_pos + br_win_sz <= word.maxTime: 122 | all_noisy = (f0_voicing_breath[ 123 | int(win_pos / time_step): int((win_pos + br_win_sz) / time_step)] < f0_min).all() 124 | rms_db = 20 * np.log10( 125 | np.clip(sound.get_rms(from_time=win_pos, to_time=win_pos + br_win_sz), a_min=1e-12, a_max=1)) 126 | # print(win_pos, win_pos + br_win_sz, all_noisy, rms_db) 127 | if all_noisy and rms_db >= br_db: 128 | if br_start is None: 129 | br_start = win_pos 130 | 
else: 131 | if br_start is not None: 132 | br_end = win_pos + br_win_sz - time_step 133 | if br_end - br_start >= br_len: 134 | centroid = spectral_centroid[int(br_start / time_step): int(br_end / time_step)].mean() 135 | if centroid >= br_centroid: 136 | ap_ranges.append((br_start, br_end)) 137 | br_start = None 138 | win_pos = br_end 139 | win_pos += time_step 140 | if br_start is not None: 141 | br_end = win_pos + br_win_sz - time_step 142 | if br_end - br_start >= br_len: 143 | centroid = spectral_centroid[int(br_start / time_step): int(br_end / time_step)].mean() 144 | if centroid >= br_centroid: 145 | ap_ranges.append((br_start, br_end)) 146 | # print(ap_ranges) 147 | if len(ap_ranges) == 0: 148 | i += 1 149 | j += 1 150 | continue 151 | words.removeInterval(word) 152 | phones.removeInterval(phone) 153 | if word.minTime < ap_ranges[0][0]: 154 | words.add(minTime=word.minTime, maxTime=ap_ranges[0][0], mark=None) 155 | phones.add(minTime=phone.minTime, maxTime=ap_ranges[0][0], mark=None) 156 | i += 1 157 | j += 1 158 | for k, ap in enumerate(ap_ranges): 159 | if k > 0: 160 | words.add(minTime=ap_ranges[k - 1][1], maxTime=ap[0], mark=None) 161 | phones.add(minTime=ap_ranges[k - 1][1], maxTime=ap[0], mark=None) 162 | i += 1 163 | j += 1 164 | words.add(minTime=ap[0], maxTime=min(word.maxTime, ap[1]), mark='AP') 165 | phones.add(minTime=ap[0], maxTime=min(word.maxTime, ap[1]), mark='AP') 166 | i += 1 167 | j += 1 168 | if ap_ranges[-1][1] < word.maxTime: 169 | words.add(minTime=ap_ranges[-1][1], maxTime=word.maxTime, mark=None) 170 | phones.add(minTime=ap_ranges[-1][1], maxTime=phone.maxTime, mark=None) 171 | i += 1 172 | j += 1 173 | 174 | # Remove short spaces 175 | i = j = 0 176 | while i < len(words): 177 | word = words[i] 178 | phone = phones[j] 179 | if word.mark is not None and word.mark != '': 180 | i += 1 181 | j += (1 if word.mark == 'AP' else len(dictionary[word.mark])) 182 | continue 183 | if word.maxTime - word.minTime >= min_space: 184 | word.mark = 'SP' 185 | phone.mark = 'SP' 186 | i += 1 187 | j += 1 188 | continue 189 | if i == 0: 190 | if len(words) >= 2: 191 | words[i + 1].minTime = word.minTime 192 | phones[j + 1].minTime = phone.minTime 193 | words.removeInterval(word) 194 | phones.removeInterval(phone) 195 | else: 196 | break 197 | elif i == len(words) - 1: 198 | if len(words) >= 2: 199 | words[i - 1].maxTime = word.maxTime 200 | phones[j - 1].maxTime = phone.maxTime 201 | words.removeInterval(word) 202 | phones.removeInterval(phone) 203 | else: 204 | break 205 | else: 206 | words[i - 1].maxTime = words[i + 1].minTime = (word.minTime + word.maxTime) / 2 207 | phones[j - 1].maxTime = phones[j + 1].minTime = (phone.minTime + phone.maxTime) / 2 208 | words.removeInterval(word) 209 | phones.removeInterval(phone) 210 | textgrid.write(str(dst / tgfile.name)) 211 | 212 | 213 | if __name__ == '__main__': 214 | enhance_tg() 215 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/reformat_wavs.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import shutil 3 | 4 | import click 5 | import librosa 6 | import numpy as np 7 | import soundfile 8 | import tqdm 9 | 10 | 11 | @click.command(help='Reformat the WAV files to 16kHz, 16bit PCM mono format and copy labels') 12 | @click.option('--src', required=True, help='Source segments directory') 13 | @click.option('--dst', required=True, help='Target segments directory') 14 | @click.option( 15 | '--normalize', 16 | 
is_flag=True, show_default=True, default=False, 17 | help='Normalize the audio (peak calculated over all segments)' 18 | ) 19 | def reformat_wavs(src, dst, normalize): 20 | src = pathlib.Path(src).resolve() 21 | dst = pathlib.Path(dst).resolve() 22 | assert src != dst, 'src and dst should not be the same path' 23 | assert src.is_dir() and (not dst.exists() or dst.is_dir()), 'src and dst must be directories' 24 | dst.mkdir(parents=True, exist_ok=True) 25 | samplerate = 16000 26 | filelist = list(src.glob('*.wav')) 27 | max_y = 1.0 28 | if normalize: 29 | max_y = 0.0 30 | for file in tqdm.tqdm(filelist): 31 | y, _ = librosa.load(file, sr=samplerate, mono=True) 32 | max_y = max(max_y, np.max(np.abs(y))) 33 | max_y += 0.01 34 | for file in tqdm.tqdm(filelist): 35 | y, _ = librosa.load(file, sr=samplerate, mono=True) 36 | soundfile.write((dst / file.name), y / max_y, samplerate, subtype='PCM_16') 37 | annotation = file.with_suffix('.lab') 38 | shutil.copy(annotation, dst) 39 | print('Reformatting and copying done.') 40 | 41 | 42 | if __name__ == '__main__': 43 | reformat_wavs() 44 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/requirements.txt: -------------------------------------------------------------------------------- 1 | biopython==1.78 2 | click 3 | librosa<0.10.0 4 | matplotlib 5 | praatio<6.0.0 6 | praat-parselmouth 7 | pyyaml 8 | soundfile 9 | sox 10 | sqlalchemy==1.4.46 11 | textgrid 12 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/select_test_set.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import random 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | import click 7 | import yaml 8 | 9 | 10 | # noinspection PyShadowingBuiltins 11 | @click.command(help='Randomly select test samples') 12 | @click.argument( 13 | 'config', 14 | type=click.Path(file_okay=True, dir_okay=False, resolve_path=True, writable=True, path_type=Path), 15 | metavar="CONFIG" 16 | ) 17 | @click.option( 18 | '--rel_path', 19 | type=click.Path(file_okay=False, dir_okay=True, resolve_path=True, path_type=Path), 20 | default=None, 21 | help='Path that is relative to the paths mentioned in the config file.' 22 | ) 23 | @click.option( 24 | '--min', '_min', 25 | show_default=True, 26 | type=click.IntRange(min=1), 27 | default=10, 28 | help='Minimum number of test samples.' 29 | ) 30 | @click.option( 31 | '--max', '_max', 32 | show_default=True, 33 | type=click.IntRange(min=1), 34 | default=20, 35 | help='Maximum number of test samples (note that each speaker will have at least one test sample).' 36 | ) 37 | @click.option( 38 | '--per_speaker', 39 | show_default=True, 40 | type=click.IntRange(min=1), 41 | default=4, 42 | help='Expected number of test samples per speaker.' 43 | ) 44 | def select_test_set(config, rel_path, _min, _max, per_speaker): 45 | assert _min <= _max, 'min must be smaller or equal to max' 46 | with open(config, 'r', encoding='utf8') as f: 47 | hparams = yaml.safe_load(f) 48 | 49 | spk_map = None 50 | spk_ids = hparams['spk_ids'] 51 | speakers = hparams['speakers'] 52 | raw_data_dirs = list(map(Path, hparams['raw_data_dir'])) 53 | assert isinstance(speakers, list), 'Speakers must be a list' 54 | assert len(speakers) == len(raw_data_dirs), \ 55 | 'Number of raw data dirs must equal number of speaker names!' 
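    # Fall back to sequential speaker IDs (0, 1, 2, ...) when spk_ids is not set in the config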
56 | if not spk_ids: 57 | spk_ids = list(range(len(raw_data_dirs))) 58 | else: 59 | assert len(spk_ids) == len(raw_data_dirs), \ 60 | 'Length of explicitly given spk_ids must equal the number of raw datasets.' 61 | assert max(spk_ids) < hparams['num_spk'], \ 62 | f'Index in spk_id sequence {spk_ids} is out of range. All values should be smaller than num_spk.' 63 | 64 | spk_map = {} 65 | path_spk_map = defaultdict(list) 66 | for ds_id, (spk_name, raw_path, spk_id) in enumerate(zip(speakers, raw_data_dirs, spk_ids)): 67 | if spk_name in spk_map and spk_map[spk_name] != spk_id: 68 | raise ValueError(f'Invalid speaker ID assignment. Name \'{spk_name}\' is assigned ' 69 | f'with different speaker IDs: {spk_map[spk_name]} and {spk_id}.') 70 | spk_map[spk_name] = spk_id 71 | path_spk_map[spk_id].append((ds_id, rel_path / raw_path if rel_path else raw_path)) 72 | 73 | training_cases = [] 74 | for spk_raw_dirs in path_spk_map.values(): 75 | training_case = [] 76 | # training cases from the same speaker are grouped together 77 | for ds_id, raw_data_dir in spk_raw_dirs: 78 | with open(raw_data_dir / 'transcriptions.csv', 'r', encoding='utf8') as f: 79 | reader = csv.DictReader(f) 80 | for row in reader: 81 | if (raw_data_dir / 'wavs' / f'{row["name"]}.wav').exists(): 82 | training_case.append(f'{ds_id}:{row["name"]}') 83 | training_cases.append(training_case) 84 | 85 | test_prefixes = [] 86 | total = min(_max, max(_min, per_speaker * len(training_cases))) 87 | quotient, remainder = total // len(training_cases), total % len(training_cases) 88 | if quotient == 0: 89 | test_counts = [1] * len(training_cases) 90 | else: 91 | test_counts = [quotient + 1] * remainder + [quotient] * (len(training_cases) - remainder) 92 | for i, count in enumerate(test_counts): 93 | test_prefixes += sorted(random.sample(training_cases[i], count)) 94 | if not hparams['test_prefixes'] or click.confirm('Overwrite existing test prefixes?', abort=False): 95 | hparams['test_prefixes'] = test_prefixes 96 | hparams['num_valid_plots'] = len(test_prefixes) 97 | with open(config, 'w', encoding='utf8') as f: 98 | yaml.dump(hparams, f, sort_keys=False) 99 | print('Test prefixes saved.') 100 | else: 101 | print('Test prefixes not saved, aborted.') 102 | 103 | if __name__ == '__main__': 104 | select_test_set() 105 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/slice_tg.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import click 4 | import librosa 5 | import soundfile 6 | import textgrid 7 | import tqdm 8 | 9 | 10 | @click.command(help='Slice 3-tier TextGrids and long recordings into segmented 2-tier TextGrids and wavs') 11 | @click.option( 12 | '--wavs', required=True, 13 | help='Directory containing the segmented wav files' 14 | ) 15 | @click.option( 16 | '--tg', required=False, 17 | help='Directory containing the segmented TextGrid files (defaults to wav directory)' 18 | ) 19 | @click.option( 20 | '--out', required=True, 21 | help='Path to output directory for combined files' 22 | ) 23 | @click.option( 24 | '--preserve_sentence_names', is_flag=True, 25 | help='Whether to use sentence marks as filenames (will be re-numbered by default)' 26 | ) 27 | @click.option( 28 | '--digits', required=False, type=int, default=3, 29 | help='Number of suffix digits (defaults to 3, will be padded with zeros on the left)' 30 | ) 31 | @click.option( 32 | '--wav_subtype', required=False, default='PCM_16', 33 | help='Wav subtype 
(defaults to PCM_16)' 34 | ) 35 | @click.option( 36 | '--overwrite', is_flag=True, 37 | help='Overwrite existing files' 38 | ) 39 | def slice_tg(wavs, tg, out, preserve_sentence_names, digits, wav_subtype, overwrite): 40 | wav_path_in = pathlib.Path(wavs) 41 | tg_path_in = wav_path_in if tg is None else pathlib.Path(tg) 42 | del tg 43 | sliced_path_out = pathlib.Path(out) 44 | sliced_path_out.mkdir(parents=True, exist_ok=True) 45 | for tg_file in tqdm.tqdm(tg_path_in.glob('*.TextGrid')): 46 | tg = textgrid.TextGrid() 47 | tg.read(tg_file) 48 | wav, sr = librosa.load((wav_path_in / tg_file.name).with_suffix('.wav'), sr=None) 49 | sentences_tier = tg[0] 50 | words_tier = tg[1] 51 | phones_tier = tg[2] 52 | idx = 0 53 | for sentence in sentences_tier: 54 | if sentence.mark == '': 55 | continue 56 | sentence_tg = textgrid.TextGrid() 57 | sentence_words_tier = textgrid.IntervalTier(name='words') 58 | sentence_phones_tier = textgrid.IntervalTier(name='phones') 59 | for word in words_tier: 60 | min_time = max(sentence.minTime, word.minTime) 61 | max_time = min(sentence.maxTime, word.maxTime) 62 | if min_time >= max_time: 63 | continue 64 | sentence_words_tier.add( 65 | minTime=min_time - sentence.minTime, maxTime=max_time - sentence.minTime, mark=word.mark 66 | ) 67 | for phone in phones_tier: 68 | min_time = max(sentence.minTime, phone.minTime) 69 | max_time = min(sentence.maxTime, phone.maxTime) 70 | if min_time >= max_time: 71 | continue 72 | sentence_phones_tier.add( 73 | minTime=min_time - sentence.minTime, maxTime=max_time - sentence.minTime, mark=phone.mark 74 | ) 75 | sentence_tg.append(sentence_words_tier) 76 | sentence_tg.append(sentence_phones_tier) 77 | 78 | if preserve_sentence_names: 79 | tg_file_out = sliced_path_out / f'{sentence.mark}.TextGrid' 80 | wav_file_out = tg_file_out.with_suffix('.wav') 81 | else: 82 | tg_file_out = sliced_path_out / f'{tg_file.stem}_{str(idx).zfill(digits)}.TextGrid' 83 | wav_file_out = tg_file_out.with_suffix('.wav') 84 | if tg_file_out.exists() and not overwrite: 85 | raise FileExistsError(str(tg_file_out)) 86 | if wav_file_out.exists() and not overwrite: 87 | raise FileExistsError(str(wav_file_out)) 88 | 89 | sentence_tg.write(tg_file_out) 90 | sentence_wav = wav[int(sentence.minTime * sr): min(wav.shape[0], int(sentence.maxTime * sr) + 1)] 91 | soundfile.write( 92 | wav_file_out, 93 | sentence_wav, samplerate=sr, subtype=wav_subtype 94 | ) 95 | idx += 1 96 | 97 | 98 | if __name__ == '__main__': 99 | slice_tg() 100 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/summary_pitch.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import click 4 | import librosa 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import parselmouth as pm 8 | import tqdm 9 | from textgrid import TextGrid 10 | 11 | import distribution 12 | 13 | 14 | @click.command(help='Generate word-level pitch summary') 15 | @click.option('--wavs', required=True, help='Path to the segments directory') 16 | @click.option('--tg', required=True, help='Path to the TextGrids directory') 17 | def summary_pitch(wavs, tg): 18 | wavs = pathlib.Path(wavs) 19 | tg_dir = pathlib.Path(tg) 20 | del tg 21 | filelist = list(wavs.glob('*.wav')) 22 | 23 | pit_map = {} 24 | f0_min = 40. 25 | f0_max = 1100. 
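    # Voicing threshold passed to the Praat autocorrelation pitch tracker (via parselmouth)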
26 | voicing_thresh_vowel = 0.45 27 | for wavfile in tqdm.tqdm(filelist): 28 | tg = TextGrid() 29 | tg.read(tg_dir / wavfile.with_suffix('.TextGrid').name) 30 | timestep = 0.01 31 | f0 = pm.Sound(str(wavfile)).to_pitch_ac( 32 | time_step=timestep, 33 | voicing_threshold=voicing_thresh_vowel, 34 | pitch_floor=f0_min, 35 | pitch_ceiling=f0_max, 36 | ).selected_array['frequency'] 37 | pitch = 12. * np.log2(f0 / 440.) + 69. 38 | for word in tg[0]: 39 | if word.mark in ['AP', 'SP']: 40 | continue 41 | if word.maxTime - word.minTime < timestep: 42 | continue 43 | word_pit = pitch[int(word.minTime / timestep): int(word.maxTime / timestep)] 44 | word_pit = np.extract(word_pit >= 0, word_pit) 45 | if word_pit.shape[0] == 0: 46 | continue 47 | counts = np.bincount(word_pit.astype(np.int64)) 48 | midi = counts.argmax() 49 | if midi in pit_map: 50 | pit_map[midi] += 1 51 | else: 52 | pit_map[midi] = 1 53 | midi_keys = sorted(pit_map.keys()) 54 | midi_keys = list(range(midi_keys[0], midi_keys[-1] + 1)) 55 | distribution.draw_distribution( 56 | title='Pitch Distribution Summary', 57 | x_label='Pitch', 58 | y_label='Number of occurrences', 59 | items=[librosa.midi_to_note(k) for k in midi_keys], 60 | values=[pit_map.get(k, 0) for k in midi_keys] 61 | ) 62 | pitch_summary = wavs / 'pitch_distribution.jpg' 63 | plt.savefig(fname=pitch_summary, 64 | bbox_inches='tight', 65 | pad_inches=0.25) 66 | print(f'Pitch distribution summary saved to {pitch_summary}') 67 | 68 | 69 | if __name__ == '__main__': 70 | summary_pitch() 71 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/validate_labels.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import click 4 | import matplotlib.pyplot as plt 5 | import tqdm 6 | 7 | import distribution 8 | 9 | 10 | # noinspection PyShadowingBuiltins 11 | @click.command(help='Validate transcription labels') 12 | @click.option('--dir', required=True, help='Path to the segments directory') 13 | @click.option('--dictionary', required=True, help='Path to the dictionary file') 14 | def validate_labels(dir, dictionary): 15 | # Load dictionary 16 | dict_path = pathlib.Path(dictionary) 17 | with open(dict_path, 'r', encoding='utf8') as f: 18 | rules = [ln.strip().split('\t') for ln in f.readlines()] 19 | dictionary = {} 20 | phoneme_set = set() 21 | for r in rules: 22 | phonemes = r[1].split() 23 | dictionary[r[0]] = phonemes 24 | phoneme_set.update(phonemes) 25 | 26 | # Run checks 27 | check_failed = False 28 | covered = set() 29 | phoneme_map = {} 30 | for ph in sorted(phoneme_set): 31 | phoneme_map[ph] = 0 32 | 33 | segments_dir = pathlib.Path(dir) 34 | filelist = list(segments_dir.glob('*.wav')) 35 | 36 | for file in tqdm.tqdm(filelist): 37 | filename = file.stem 38 | annotation = file.with_suffix('.lab') 39 | if not annotation.exists(): 40 | print(f'No annotation found for \'{filename}\'!') 41 | check_failed = True 42 | continue 43 | with open(annotation, 'r', encoding='utf8') as f: 44 | syllables = f.read().strip().split() 45 | if not syllables: 46 | print(f'Annotation file \'{annotation}\' is empty!') 47 | check_failed = True 48 | else: 49 | oov = [] 50 | for s in syllables: 51 | if s not in dictionary: 52 | oov.append(s) 53 | else: 54 | for ph in dictionary[s]: 55 | phoneme_map[ph] += 1 56 | covered.update(dictionary[s]) 57 | if oov: 58 | print(f'Syllable(s) {oov} not allowed in annotation file \'{annotation}\'') 59 | check_failed = True 60 | 61 | # Phoneme 
coverage 62 | uncovered = phoneme_set - covered 63 | if uncovered: 64 | print(f'The following phonemes are not covered!') 65 | print(sorted(uncovered)) 66 | print('Please add more recordings to cover these phonemes.') 67 | check_failed = True 68 | 69 | if not check_failed: 70 | print('All annotations are well prepared.') 71 | 72 | phoneme_list = sorted(phoneme_set) 73 | phoneme_counts = [phoneme_map[ph] for ph in phoneme_list] 74 | distribution.draw_distribution( 75 | title='Phoneme Distribution Summary', 76 | x_label='Phoneme', 77 | y_label='Number of occurrences', 78 | items=phoneme_list, 79 | values=phoneme_counts 80 | ) 81 | phoneme_summary = segments_dir / 'phoneme_distribution.jpg' 82 | plt.savefig(fname=phoneme_summary, 83 | bbox_inches='tight', 84 | pad_inches=0.25) 85 | print(f'Phoneme distribution summary saved to {phoneme_summary}') 86 | 87 | 88 | if __name__ == '__main__': 89 | validate_labels() 90 | -------------------------------------------------------------------------------- /acoustic_forced_alignment/validate_lengths.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import tqdm 3 | import os 4 | import pathlib 5 | 6 | import click 7 | 8 | 9 | def length(src: str): 10 | if os.path.isfile(src) and src.endswith('.wav'): 11 | return librosa.get_duration(filename=src) / 3600. 12 | elif os.path.isdir(src): 13 | total = 0 14 | for ch in [os.path.join(src, c) for c in os.listdir(src)]: 15 | total += length(ch) 16 | return total 17 | return 0 18 | 19 | 20 | # noinspection PyShadowingBuiltins 21 | @click.command(help='Validate segment lengths') 22 | @click.option('--dir', required=True, help='Path to the segments directory') 23 | def validate_lengths(dir): 24 | dir = pathlib.Path(dir) 25 | assert dir.exists() and dir.is_dir(), 'The chosen path does not exist or is not a directory.' 26 | 27 | reported = False 28 | filelist = list(dir.glob('*.wav')) 29 | total_length = 0. 30 | for file in tqdm.tqdm(filelist): 31 | wave_seconds = librosa.get_duration(filename=str(file)) 32 | if wave_seconds < 2.: 33 | reported = True 34 | print(f'Too short! \'{file}\' has a length of {round(wave_seconds, 1)} seconds!') 35 | if wave_seconds > 20.: 36 | reported = True 37 | print(f'Too long! \'{file}\' has a length of {round(wave_seconds, 1)} seconds!') 38 | total_length += wave_seconds / 3600. 39 | 40 | print(f'Found {len(filelist)} segments with total length of {round(total_length, 2)} hours.') 41 | 42 | if not reported: 43 | print('All segments have proper length.') 44 | 45 | 46 | if __name__ == '__main__': 47 | validate_lengths() 48 | -------------------------------------------------------------------------------- /midi-recognition/README.md: -------------------------------------------------------------------------------- 1 | # MIDI Recognition 2 | 3 | ## 1. merge_wavs.py 4 | 5 | Merge short audio clips into long audio segments of similar length (e.g. 4 min) and a fixed sampling rate (e.g. 16000) and save the timestamps into tags.json. 6 | 7 | ## 2. extract_midi.py 8 | 9 | Extract MIDI sequences from of OpenSVIP json files, split them back into short clips according to tags.json, and add them into transcriptions.csv. 
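For reference, a minimal sketch of the tags.json structure shared by the two scripts, based on the fields written by merge_wavs.py (segment names follow its zero-padded counter; the clip names and duration values below are placeholders):

```json
{
  "00000000": [
    {"filename": "clip_001", "duration": 12.34},
    {"filename": "clip_002", "duration": 9.87}
  ]
}
```

extract_midi.py walks these durations to split the recognized note sequence back onto the original clips.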
10 | 11 | -------------------------------------------------------------------------------- /midi-recognition/extract_midi.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import pathlib 4 | 5 | import click 6 | import librosa 7 | from typing import List, Tuple 8 | 9 | 10 | @click.command(help='Extract MIDI sequences from OpenSVIP json files and add them into transcriptions.csv') 11 | @click.argument('json_dir', metavar='JSONS') 12 | @click.argument('csv_file', metavar='TRANSCRIPTIONS') 13 | @click.option('--key', type=int, default=0, show_default=True, 14 | metavar='SEMITONES', help='Key transition') 15 | def extract_midi(json_dir, csv_file, key): 16 | json_dir = pathlib.Path(json_dir).resolve() 17 | assert json_dir.exists(), 'The json directory does not exist.' 18 | tags_file = json_dir / 'tags.json' 19 | assert tags_file.exists(), 'The tags.json does not exist.' 20 | csv_file = pathlib.Path(csv_file).resolve() 21 | assert csv_file.resolve(), 'The path to transcriptions.csv does not exist.' 22 | tol = 0.001 23 | 24 | with open(tags_file, 'r', encoding='utf8') as f: 25 | tags: dict = json.load(f) 26 | 27 | # Read MIDI sequences 28 | note_seq_map: dict = {} # key: merged filename, value: note sequence 29 | for json_file in json_dir.iterdir(): 30 | if json_file.stem not in tags or not json_file.is_file() or json_file.suffix != '.json': 31 | continue 32 | with open(json_file, 'r', encoding='utf8') as f: 33 | json_obj: dict = json.load(f) 34 | assert len(json_obj['SongTempoList']) == 1, \ 35 | f'[ERROR] {json_file.name}: there must be one and only one single tempo in the project.' 36 | 37 | tempo = json_obj['SongTempoList'][0]['BPM'] 38 | midi_seq: list = json_obj['TrackList'][0]['NoteList'] 39 | note_seq: List[Tuple[str, float]] = [] # (note, duration) 40 | prev_pos: int = 0 # in ticks 41 | for i, midi in enumerate(midi_seq): 42 | if prev_pos < midi['StartPos']: 43 | note_seq.append( 44 | ('rest', (midi['StartPos'] - prev_pos) / 8 / tempo) 45 | ) 46 | note_seq.append( 47 | (librosa.midi_to_note(midi['KeyNumber'] + key, unicode=False), midi['Length'] / 8 / tempo) 48 | ) 49 | prev_pos = midi['StartPos'] + midi['Length'] 50 | remain_secs = prev_pos / 8 / tempo - sum(t['duration'] for t in tags[json_file.stem]) 51 | if remain_secs > tol: 52 | note_seq.append( 53 | ('rest', remain_secs) 54 | ) 55 | note_seq_map[json_file.stem] = note_seq 56 | 57 | # Load transcriptions 58 | transcriptions: dict = {} # key: split filename, value: attr dict 59 | with open(csv_file, 'r', encoding='utf8') as f: 60 | reader = csv.DictReader(f) 61 | for attrs in reader: 62 | transcriptions[attrs['name']] = attrs 63 | 64 | # Split note sequence and add into transcriptions 65 | for merged_name, note_seq in note_seq_map.items(): 66 | note_seq: Tuple[str, float] 67 | idx = 0 68 | offset = 0. 69 | cur_note_secs = 0. 70 | cur_clip_secs = 0. 71 | for split_tag in tags[merged_name]: 72 | split_note_seq = [] 73 | while idx < len(note_seq): 74 | cur_note_dur = note_seq[idx][1] - offset 75 | if cur_note_secs + cur_note_dur <= cur_clip_secs + split_tag['duration']: 76 | split_note_seq.append( 77 | (note_seq[idx][0], cur_note_dur) 78 | ) 79 | idx += 1 80 | cur_note_secs += cur_note_dur 81 | offset = 0. 
82 | else: 83 | offset = cur_clip_secs + split_tag['duration'] - cur_note_secs 84 | cur_note_secs += offset 85 | cur_clip_secs += split_tag['duration'] 86 | split_note_seq.append( 87 | (note_seq[idx][0], offset) 88 | ) 89 | break 90 | if idx == len(note_seq) and cur_clip_secs + split_tag['duration'] - cur_note_secs >= tol: 91 | split_note_seq.append( 92 | ('rest', cur_clip_secs + split_tag['duration'] - cur_note_secs) 93 | ) 94 | if split_tag['filename'] not in transcriptions: 95 | continue 96 | dst_dict = transcriptions[split_tag['filename']] 97 | dst_dict['note_seq'] = ' '.join(n[0] for n in split_note_seq) 98 | dst_dict['note_dur'] = ' '.join(str(n[1]) for n in split_note_seq) 99 | 100 | with open(csv_file, 'w', encoding='utf8', newline='') as f: 101 | writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur', 'ph_num', 'note_seq', 'note_dur']) 102 | writer.writeheader() 103 | writer.writerows(v for _, v in transcriptions.items()) 104 | 105 | 106 | if __name__ == '__main__': 107 | extract_midi() 108 | -------------------------------------------------------------------------------- /midi-recognition/merge_wavs.py: -------------------------------------------------------------------------------- 1 | import tqdm 2 | import json 3 | import pathlib 4 | from collections import OrderedDict 5 | 6 | import click 7 | import librosa 8 | import numpy as np 9 | import soundfile 10 | 11 | 12 | @click.command(help='Merge clips into segments of similar length') 13 | @click.argument('input_wavs', metavar='INPUT_WAVS') 14 | @click.argument('output_wavs', metavar='OUTPUT_WAVS') 15 | @click.option('--length', type=int, required=False, default=240, metavar='SECONDS') 16 | @click.option('--sr', type=int, required=False, default=16000) 17 | def merge_wavs( 18 | input_wavs, output_wavs, length, sr 19 | ): 20 | input_wavs = pathlib.Path(input_wavs).resolve() 21 | assert input_wavs.exists(), 'The input directory does not exist.' 22 | output_wavs = pathlib.Path(output_wavs).resolve() 23 | assert not output_wavs.exists() or all(False for _ in output_wavs.iterdir()), \ 24 | 'The output directory is not empty.' 25 | 26 | output_wavs.mkdir(parents=True, exist_ok=True) 27 | tags = OrderedDict() 28 | count = 0 29 | cache: list[tuple[str, np.ndarray]] = [] 30 | cache_len = 0. 31 | 32 | def save_cache(): 33 | nonlocal tags, count, cache, cache_len 34 | waveform_merged = np.concatenate(tuple(c[1] for c in cache)) 35 | filename = (output_wavs / str(count).zfill(8)).with_suffix('.wav') 36 | soundfile.write( 37 | str(filename), 38 | waveform_merged, sr, format='WAV' 39 | ) 40 | tags[str(filename.stem)] = [ 41 | { 42 | 'filename': c[0], 43 | 'duration': c[1].shape[0] / sr 44 | } 45 | for c in cache 46 | ] 47 | cache.clear() 48 | cache_len = 0. 
49 | count += 1 50 | 51 | for wav in tqdm.tqdm(input_wavs.iterdir()): 52 | if not wav.is_file() or wav.suffix != '.wav': 53 | continue 54 | y, _ = librosa.load(wav, sr=sr, mono=True) 55 | cur_len = y.shape[0] / sr 56 | if len(cache) > 0 and cache_len + cur_len >= length: 57 | save_cache() 58 | cache.append((wav.stem, y)) 59 | cache_len += cur_len 60 | if len(cache) > 0: 61 | save_cache() 62 | 63 | tags_path = output_wavs / 'tags.json' 64 | with open(tags_path, 'w', encoding='utf8') as f: 65 | json.dump(tags, f, ensure_ascii=False, indent=2) 66 | print(f'Timestamps saved to {tags_path}') 67 | 68 | 69 | if __name__ == '__main__': 70 | merge_wavs() 71 | -------------------------------------------------------------------------------- /variance-temp-solution/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | __pycache__/ 4 | *.sh 5 | local_tools/ 6 | /venv/ 7 | 8 | .vscode 9 | .ipynb_checkpoints/ 10 | 11 | assets/* 12 | !assets/.gitkeep 13 | -------------------------------------------------------------------------------- /variance-temp-solution/README.md: -------------------------------------------------------------------------------- 1 | # Making variance datasets (temporary solution) 2 | 3 | This pipeline will guide you through migrating your old DiffSinger datasets to the new and complete format for both acoustic and variance model training. 4 | 5 | ## 1. Clone repo and install dependencies 6 | 7 | ```bash 8 | git clone https://github.com/openvpi/MakeDiffSinger.git 9 | cd MakeDiffSinger/variance-temp-solution 10 | pip install -r requirements.txt # or you can reuse a pre-existing DiffSinger environment 11 | ``` 12 | 13 | ## 2. Convert transcriptions 14 | 15 | Assume you have a DiffSinger dataset which contains a transcriptions.txt file. 16 | 17 | Run: 18 | 19 | ```bash 20 | python convert_txt.py path/to/your/transcriptions.txt 21 | ``` 22 | 23 | This will generate transcriptions.csv in the same folder as transcriptions.txt, which has three attributes: `name`, `ph_seq` and `ph_dur`. 24 | 25 | ## 3. Add `ph_num` attribute 26 | 27 | The attribute `ph_num` is needed for training the variance models, especially if you need to train the phoneme duration predictor. This attribute represents the number of phones that each word contains. 28 | 29 | In singing, vowels, instead of consonants, are used to align with the beginnings of notes. For this reason, each word should start with a vowel/AP/SP, and end with leading consonant(s) of the next word (if there are any). See the example below: 30 | 31 | ```text 32 | text | AP | shi | zhe | => word transcriptions (pinyin, romaji, etc.) 33 | ph_seq | AP | sh | ir | zh | e | => phoneme sequence 34 | ph_num | 2 | 2 | 1 | => word-level phoneme division 35 | ``` 36 | 37 | where `sh` and `zh` are consonants, and `AP`, `ir` and `e` can be regarded as vowels. There is one special case where a word can start with a consonant: isolated consonants. In this case, all phones in the word are consonants. 38 | 39 | For all monosyllabic phoneme systems (at most one vowel in one word), this step can be performed automatically. 40 | 41 | ### 3.1 two-part dictionaries (Chinese, Japanese, etc.) 42 | 43 | A two-part dictionary has "V" and "C-V" phoneme patterns. 44 | 45 | Run: 46 | 47 | ```bash 48 | python add_ph_num.py path/to/your/transcriptions.csv --dictionary path/to/your/dictionary.txt 49 | ``` 50 | 51 | ### 3.2 monosyllabic phoneme systems (Cantonese, Korean, etc.)
52 | 53 | A universal monosyllabic phoneme system has "C(m)-V-C(n)" (m,n >= 0) phoneme patterns. 54 | 55 | 1. Collect all vowels into vowels.txt, divided by spaces. 56 | 57 | 2. Collect all consonants into consonants.txt, divided by spaces. 58 | 59 | 3. Run: 60 | 61 | ```bash 62 | python add_ph_num.py path/to/your/transcriptions.csv --vowels vowels.txt --consonants consonants.txt 63 | ``` 64 | 65 | ### 3.3 polysyllabic phoneme systems (English, Russian, etc.) 66 | 67 | We recommend performing this step manually because word divisions cannot be inferred from phoneme sequences in these phoneme systems. 68 | 69 | > After finishing this step, the transcriptions.csv file can be directly used to train the phoneme duration predictor. If you want to train a pitch predictor, you must finish the remaining steps as follows. 70 | > 71 | 72 | ## 4. Estimate note values 73 | 74 | The note tier is another division of words besides the phoneme tier. See the example below: 75 | 76 | ```text 77 | ph_seq | AP | sh | ir | zh | e | => phoneme sequence 78 | ph_num | 2 | 2 | 1 | => word-level phoneme division 79 | note_seq | rest | D#3 | D#3 | C4 | => note sequence 80 | note_slur | 0 | 0 | 0 | 1 | => slur flag (will not be stored) 81 | ``` 82 | 83 | Note sequences can be automatically estimated and manually refined in two ways. 84 | 85 | ### 4.1 Infer a rough pitch value for each word 86 | 87 | The following program can infer a rough note value for each word. It does not produce slurs - slurs are hard to judge, and different people have different labeling styles. 88 | 89 | Run: 90 | 91 | ```bash 92 | python estimate_midi.py path/to/your/transcriptions.csv path/to/your/wavs 93 | ``` 94 | 95 | > **IMPORTANT** 96 | > 97 | > This step only estimates the rough MIDI value for each word. You have to refine the MIDI sequences, otherwise the pitch predictor will not be accurate. 98 | 99 | ### 4.2 (New!) Use the AI-powered MIDI extractor - SOME 100 | 101 | SOME (Singing-Oriented MIDI Extractor) is an NN-based MIDI extractor developed under the DiffSinger ecosystem. See guidance [here](https://github.com/openvpi/SOME#inference-via-pretrained-model-diffsinger-dataset) for using it on your DiffSinger dataset. 102 | 103 | ## 5. Refine MIDI sequences 104 | 105 | ### 5.1 take apart transcriptions.csv into DS files 106 | 107 | Run: 108 | 109 | ```bash 110 | python convert_ds.py csv2ds path/to/your/transcriptions.csv path/to/your/wavs 111 | ``` 112 | 113 | This will generate *.ds files matching your *.wav files in the same directory. 114 | 115 | > **IMPORTANT** 116 | > 117 | > In this step, we highly recommend using RMVPE, a more accurate NN-based pitch extraction algorithm, to get better pitch results. See guidance [here](#rmvpe-pitch-extraction-algorithm). 118 | > 119 | > Also note that after you finish manual MIDI refinement, please use the **same algorithm** and **same model** in your DiffSinger configuration files for variance model training to get the best results. 120 | 121 | ### 5.2 manually edit MIDI sequences 122 | 123 | Get the latest release of SlurCutter from [here](../README.md#essential-tools-to-process-and-label-your-datasets). This simple tool helps you adjust MIDI pitch in each DS file and cut notes into slurs if necessary. Be sure to back up your DS files before you start, since this tool will automatically save and overwrite an edited DS file.
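For orientation, each *.ds file written in step 5.1 is a JSON list with one object per sentence, and SlurCutter edits the note-related fields (`note_seq`, `note_dur`, `note_slur`). A trimmed sketch with placeholder values (the field set follows what `convert_ds.py csv2ds` writes; real files store `f0_timestep` as hop_size / sample_rate, 512 / 44100 by default):

```json
[
    {
        "offset": 0.0,
        "text": "AP sh ir zh e",
        "ph_seq": "AP sh ir zh e",
        "ph_dur": "0.3 0.12 0.5 0.08 0.6",
        "ph_num": "2 2 1",
        "note_seq": "rest D#3 C4",
        "note_dur": "0.42 0.58 0.6",
        "note_slur": "0 0 0",
        "f0_seq": "220.5 221.0 ...",
        "f0_timestep": "0.0116"
    }
]
```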
124 | 125 | ### 5.3 re-combine DS files into transcriptions.csv 126 | 127 | Run: 128 | 129 | ```bash 130 | python convert_ds.py ds2csv path/to/your/ds path/to/your/transcriptions.csv 131 | ``` 132 | 133 | This will generate a new transcriptions.csv from the DS files you just edited. Append `-f` if you are sure you want to overwrite the original transcription file (and the script complains about it). 134 | 135 | Now the transcriptions.csv can be used for all functionalities of DiffSinger training. 136 | 137 | `convert_ds.py ds2csv` supports DS files which have no corresponding WAV files. All sentences in these files will be assigned a virtual item name (the DS file stem, with a `#index` suffix when a file contains multiple sentences) and inserted into the transcriptions. This is a preparation to support using DS tuning projects to train a variance model. In addition, a `curves.json` file is written to support `f0` sequence refinement. 138 | 139 | ## (Appendix) other useful tools 140 | 141 | ### RMVPE pitch extraction algorithm 142 | 143 | convert_ds.py and estimate_midi.py support the state-of-the-art RMVPE pitch extraction algorithm. To use it: 144 | 145 | - Install PyTorch via [official guidance](https://pytorch.org/get-started/locally/). 146 | - Get RMVPE pretrained model [here](https://github.com/yxlllc/RMVPE/releases). 147 | - Put the RMVPE model.pt in `variance-temp-solution/assets/rmvpe/`. 148 | - Use `--pe rmvpe` when running `python convert_ds.py csv2ds` or `python estimate_midi.py`. 149 | 150 | ### correct_cents.py 151 | 152 | Apply cents correction to note sequences in a transcriptions.csv to offset out-of-tune errors. Pitch extracted from the corresponding waveforms is needed for reference. 153 | 154 | Usage: 155 | 156 | ```bash 157 | python correct_cents.py csv path/to/your/transcriptions.csv path/to/your/wavs 158 | ``` 159 | 160 | or 161 | 162 | ```bash 163 | python correct_cents.py ds path/to/your/ds/files 164 | ``` 165 | 166 | Note: this operation will overwrite your input file(s). 167 | 168 | ### eliminate_short.py 169 | 170 | Eliminate short slur notes in DS files. Slurs that are shorter than a given threshold (in seconds) will be merged into their neighboring notes within the same word. 171 | 172 | Usage: 173 | 174 | ```bash 175 | python eliminate_short.py path/to/your/ds/files THRESHOLD 176 | ``` 177 | 178 | Note: this operation will overwrite your input DS files. 179 | -------------------------------------------------------------------------------- /variance-temp-solution/add_ph_num.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pathlib 3 | 4 | import click 5 | 6 | 7 | @click.command(help='Add ph_num attribute into transcriptions.csv') 8 | @click.argument('transcription', metavar='TRANSCRIPTIONS') 9 | @click.option('--dictionary', metavar='DICTIONARY') 10 | @click.option('--vowels', metavar='FILE') 11 | @click.option('--consonants', metavar='FILE') 12 | def add_ph_num( 13 | transcription: str, 14 | dictionary: str = None, 15 | vowels: str = None, 16 | consonants: str = None 17 | ): 18 | assert dictionary is not None or (vowels is not None and consonants is not None), \ 19 | 'Either dictionary file or vowels and consonants file should be specified.'
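# When --dictionary is given, vowel/consonant sets are derived from its entries:
# single-phoneme entries are vowels; for two-phoneme entries the first phoneme is
# a consonant and the second a vowel (AP/SP always count as vowels). ph_num is
# then computed by grouping each vowel with the consonants that follow it.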
20 | if dictionary is not None: 21 | dictionary = pathlib.Path(dictionary).resolve() 22 | vowels = {'SP', 'AP'} 23 | consonants = set() 24 | with open(dictionary, 'r', encoding='utf8') as f: 25 | rules = f.readlines() 26 | for r in rules: 27 | syllable, phonemes = r.split('\t') 28 | phonemes = phonemes.split() 29 | assert len(phonemes) <= 2, 'We only support two-phase dictionaries for automatically adding ph_num.' 30 | if len(phonemes) == 1: 31 | vowels.add(phonemes[0]) 32 | else: 33 | consonants.add(phonemes[0]) 34 | vowels.add(phonemes[1]) 35 | else: 36 | vowels_path = pathlib.Path(vowels).resolve() 37 | consonants_path = pathlib.Path(consonants).resolve() 38 | vowels = {'SP', 'AP'} 39 | consonants = set() 40 | with open(vowels_path, 'r', encoding='utf8') as f: 41 | vowels.update(f.read().split()) 42 | with open(consonants_path, 'r', encoding='utf8') as f: 43 | consonants.update(f.read().split()) 44 | overlapped = vowels.intersection(consonants) 45 | assert len(vowels.intersection(consonants)) == 0, \ 46 | 'Vowel set and consonant set overlapped. The following phonemes ' \ 47 | 'appear both as vowels and as consonants:\n' \ 48 | f'{sorted(overlapped)}' 49 | 50 | transcription = pathlib.Path(transcription).resolve() 51 | items: list[dict] = [] 52 | with open(transcription, 'r', encoding='utf8') as f: 53 | reader = csv.DictReader(f) 54 | for item in reader: 55 | items.append(item) 56 | 57 | for item in items: 58 | item: dict 59 | ph_seq = item['ph_seq'].split() 60 | for ph in ph_seq: 61 | assert ph in vowels or ph in consonants, \ 62 | f'Invalid phoneme symbol \'{ph}\' in \'{item["name"]}\'.' 63 | ph_num = [] 64 | i = 0 65 | while i < len(ph_seq): 66 | j = i + 1 67 | while j < len(ph_seq) and ph_seq[j] in consonants: 68 | j += 1 69 | ph_num.append(str(j - i)) 70 | i = j 71 | item['ph_num'] = ' '.join(ph_num) 72 | 73 | with open(transcription, 'w', encoding='utf8', newline='') as f: 74 | writer = csv.DictWriter(f, fieldnames=items[0].keys()) 75 | writer.writeheader() 76 | writer.writerows(items) 77 | 78 | 79 | if __name__ == '__main__': 80 | add_ph_num() 81 | -------------------------------------------------------------------------------- /variance-temp-solution/add_ph_num_advanced.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pathlib 3 | from typing import Tuple, List 4 | 5 | import click 6 | import textgrid 7 | 8 | 9 | class RuleTerm: 10 | def __init__(self, key: str, is_wildcard: bool = False): 11 | self.key = key 12 | self.is_wildcard = is_wildcard 13 | 14 | def __repr__(self): 15 | if self.is_wildcard: 16 | return f"*{self.key}" 17 | return self.key 18 | 19 | 20 | class QueryTerm: 21 | def __init__(self, specified_key: str = None, wildcard_key: str = None): 22 | self.specified_key = specified_key 23 | self.wildcard_key = wildcard_key 24 | 25 | def __repr__(self): 26 | return str((self.specified_key, self.wildcard_key)) 27 | 28 | 29 | class TrieNode: 30 | def __init__(self): 31 | self.children = {} 32 | self.wildcards = {} 33 | self.value = None 34 | 35 | def __setitem__(self, key: Tuple[RuleTerm], value): 36 | if not key: 37 | self.value = value 38 | else: 39 | term, *key = key 40 | if term.is_wildcard: 41 | if term.key not in self.wildcards: 42 | self.wildcards[term.key] = TrieNode() 43 | self.wildcards[term.key][(*key,)] = value 44 | else: 45 | if term.key not in self.children: 46 | self.children[term.key] = TrieNode() 47 | self.children[term.key][(*key,)] = value 48 | 49 | def __getitem__(self, key: Tuple[RuleTerm]): 
50 | if not key: 51 | return self.value 52 | term, *key = key 53 | if term.is_wildcard: 54 | if term.key not in self.wildcards: 55 | return None 56 | return self.wildcards[term.key][(*key,)] 57 | if term.key not in self.children: 58 | return None 59 | return self.children[term.key][(*key,)] 60 | 61 | def find_paths(self, query: List[QueryTerm]) -> List[RuleTerm]: 62 | if not query: 63 | return [] 64 | term, *query = query 65 | paths = [] 66 | if term.specified_key in self.children: 67 | if self.children[term.specified_key].value is not None: 68 | paths.append([RuleTerm(term.specified_key, False)]) 69 | for path in self.children[term.specified_key].find_paths(query): 70 | paths.append([RuleTerm(term.specified_key, False), *path]) 71 | if term.wildcard_key in self.wildcards: 72 | if self.wildcards[term.wildcard_key].value is not None: 73 | paths.append([RuleTerm(term.wildcard_key, True)]) 74 | for path in self.wildcards[term.wildcard_key].find_paths(query): 75 | paths.append([RuleTerm(term.wildcard_key, True), *path]) 76 | return paths 77 | 78 | def find_best_path(self, query: Tuple[QueryTerm]) -> List[RuleTerm]: 79 | paths = self.find_paths(list(query)) 80 | return max( 81 | paths, 82 | default=None, 83 | key=lambda p: ( 84 | len(p), 85 | sum(not t.is_wildcard for t in p), 86 | min(enumerate(p), key=lambda e: (not e[1].is_wildcard, e[0]))[0] 87 | ) 88 | ) 89 | 90 | 91 | CONSONANT = 0 92 | VOWEL = 1 93 | LIQUID = 2 94 | 95 | 96 | @click.command(help='Add ph_num attribute into transcriptions.csv (advanced mode)') 97 | @click.argument( 98 | 'transcription', 99 | type=click.Path(exists=True, readable=True, path_type=pathlib.Path), 100 | metavar='TRANSCRIPTIONS' 101 | ) 102 | @click.option( 103 | '--tg', required=True, 104 | type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=pathlib.Path), 105 | help='Path to TextGrids' 106 | ) 107 | @click.option( 108 | '--vowels', 109 | type=click.Path(exists=True, readable=True, path_type=pathlib.Path), 110 | metavar='FILE', 111 | help='Path to the file containing vowels' 112 | ) 113 | @click.option( 114 | '--consonants', 115 | type=click.Path(exists=True, readable=True, path_type=pathlib.Path), 116 | metavar='FILE', 117 | help='Path to the file containing consonants' 118 | ) 119 | @click.option( 120 | '--liquids', 121 | type=click.Path(exists=True, readable=True, path_type=pathlib.Path), 122 | metavar='FILE', 123 | help='Path to the file containing liquids' 124 | ) 125 | def add_ph_num_advanced( 126 | transcription: pathlib.Path, 127 | tg: pathlib.Path, 128 | vowels: pathlib.Path = None, 129 | consonants: pathlib.Path = None, 130 | liquids: pathlib.Path = None 131 | ): 132 | with open(transcription, 'r', encoding='utf8') as f: 133 | reader = csv.DictReader(f) 134 | items = list(reader) 135 | phoneme_type_map = { 136 | 'AP': VOWEL, 137 | 'SP': VOWEL, 138 | } 139 | if vowels is not None: 140 | with open(vowels, 'r', encoding='utf8') as f: 141 | for v in f.read().split(): 142 | phoneme_type_map[v] = VOWEL 143 | if consonants is not None: 144 | with open(consonants, 'r', encoding='utf8') as f: 145 | for c in f.read().split(): 146 | phoneme_type_map[c] = CONSONANT 147 | if liquids is not None: 148 | with open(liquids, 'r', encoding='utf8') as f: 149 | for l in f.read().split(): 150 | phoneme_type_map[l] = LIQUID 151 | 152 | trie = TrieNode() 153 | trie[( 154 | RuleTerm(VOWEL, True), 155 | )] = [0] 156 | trie[( 157 | RuleTerm(CONSONANT, True), 158 | RuleTerm(LIQUID, True), 159 | RuleTerm(VOWEL, True), 160 | )] = [1] 161 | trie[( 162 | 
RuleTerm(LIQUID, True), 163 | RuleTerm(LIQUID, True), 164 | RuleTerm(VOWEL, True), 165 | )] = [1] 166 | 167 | for item in items: 168 | name = item['name'] 169 | tg_path = tg / f"{name}.TextGrid" 170 | tg_obj = textgrid.TextGrid() 171 | tg_obj.read(tg_path, encoding='utf8') 172 | words_tier = tg_obj[0] 173 | phones_tier = tg_obj[1] 174 | 175 | if item['ph_seq'].split() != [i.mark for i in phones_tier]: 176 | raise ValueError(f"Error: ph_seq mismatch in item: {name}") 177 | for phone_idx, phone_interval in enumerate(phones_tier): 178 | if phone_interval.mark not in phoneme_type_map: 179 | raise ValueError( 180 | f"Error: invalid phone in item: {name}, index: {phone_idx}, phone: {phone_interval.mark}" 181 | ) 182 | 183 | is_onset = [] 184 | for word_idx, word_interval in enumerate(words_tier): 185 | start_ph_idx = min( 186 | enumerate(tg_obj[1]), 187 | key=lambda e: abs(e[1].minTime - word_interval.minTime) 188 | )[0] 189 | end_ph_idx = min( 190 | enumerate(tg_obj[1]), 191 | key=lambda e: abs(e[1].maxTime - word_interval.maxTime) 192 | )[0] 193 | if phones_tier[start_ph_idx].minTime != word_interval.minTime: 194 | raise ValueError( 195 | f"Error: word minTime not aligned to phone minTime in item: " 196 | f"{name}, index: {word_idx}, word: {word_interval.mark}" 197 | ) 198 | if phones_tier[end_ph_idx].maxTime != word_interval.maxTime: 199 | raise ValueError( 200 | f"Error: word maxTime not aligned to phone maxTime in item: " 201 | f"{name}, index: {word_idx}, word: {word_interval.mark}" 202 | ) 203 | word_phones = [i.mark for i in phones_tier[start_ph_idx:end_ph_idx + 1]] 204 | i = 0 205 | while i < len(word_phones): 206 | query = [ 207 | QueryTerm(specified_key=ph, wildcard_key=phoneme_type_map[ph]) 208 | for ph in word_phones[i:] 209 | ] 210 | best_path = trie.find_best_path(query) 211 | if not best_path: 212 | is_onset.append(False) 213 | i += 1 214 | continue 215 | onsets = trie[best_path] 216 | is_onset.extend( 217 | j in onsets 218 | for j in range(len(best_path)) 219 | ) 220 | i += len(best_path) 221 | acc = 0 222 | ph_num = [] 223 | for flag in is_onset: 224 | if flag: 225 | if acc > 0: 226 | ph_num.append(acc) 227 | acc = 1 228 | else: 229 | acc += 1 230 | if acc > 0: 231 | ph_num.append(acc) 232 | item['ph_num'] = ' '.join(str(n) for n in ph_num) 233 | 234 | with open(transcription, 'w', encoding='utf8', newline='') as f: 235 | writer = csv.DictWriter(f, fieldnames=items[0].keys()) 236 | writer.writeheader() 237 | writer.writerows(items) 238 | 239 | 240 | if __name__ == '__main__': 241 | add_ph_num_advanced() 242 | -------------------------------------------------------------------------------- /variance-temp-solution/assets/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openvpi/MakeDiffSinger/ca134d36dc8eec06002a72cd0a59257abcf7bb84/variance-temp-solution/assets/.gitkeep -------------------------------------------------------------------------------- /variance-temp-solution/convert_ds.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import pathlib 4 | from decimal import Decimal 5 | from math import isclose 6 | 7 | import click 8 | import librosa 9 | import numpy as np 10 | from tqdm import tqdm 11 | 12 | from get_pitch import get_pitch 13 | 14 | 15 | def try_resolve_note_slur_by_matching(ph_dur, ph_num, note_dur, tol): 16 | if len(ph_num) > len(note_dur): 17 | raise ValueError("ph_num should not be longer than note_dur.") 18 | 
ph_num_cum = np.cumsum([0] + ph_num) 19 | word_pos = np.cumsum([sum(ph_dur[l:r]) for l, r in zip(ph_num_cum[:-1], ph_num_cum[1:])]) 20 | note_pos = np.cumsum(note_dur) 21 | new_note_dur = [] 22 | 23 | note_slur = [] 24 | idx_word, idx_note = 0, 0 25 | slur = False 26 | while idx_word < len(word_pos) and idx_note < len(note_pos): 27 | if isclose(word_pos[idx_word], note_pos[idx_note], abs_tol=tol): 28 | note_slur.append(1 if slur else 0) 29 | new_note_dur.append(word_pos[idx_word]) 30 | idx_word += 1 31 | idx_note += 1 32 | slur = False 33 | elif note_pos[idx_note] > word_pos[idx_word]: 34 | raise ValueError("Cannot resolve note_slur by matching.") 35 | elif note_pos[idx_note] <= word_pos[idx_word]: 36 | note_slur.append(1 if slur else 0) 37 | new_note_dur.append(note_pos[idx_note]) 38 | idx_note += 1 39 | slur = True 40 | ret_note_dur = np.diff(new_note_dur, prepend=Decimal("0.0")).tolist() 41 | assert len(ret_note_dur) == len(note_slur) 42 | return ret_note_dur, note_slur 43 | 44 | 45 | def try_resolve_slur_by_slicing(ph_dur, ph_num, note_seq, note_dur, tol): 46 | ph_num_cum = np.cumsum([0] + ph_num) 47 | word_pos = np.cumsum([sum(ph_dur[l:r]) for l, r in zip(ph_num_cum[:-1], ph_num_cum[1:])]) 48 | note_pos = np.cumsum(note_dur) 49 | new_note_seq = [] 50 | new_note_dur = [] 51 | 52 | note_slur = [] 53 | idx_word, idx_note = 0, 0 54 | while idx_word < len(word_pos): 55 | slur = False 56 | if note_pos[idx_note] > word_pos[idx_word] and not isclose( 57 | note_pos[idx_note], word_pos[idx_word], abs_tol=tol 58 | ): 59 | new_note_seq.append(note_seq[idx_note]) 60 | new_note_dur.append(word_pos[idx_word]) 61 | note_slur.append(1 if slur else 0) 62 | else: 63 | while idx_note < len(note_pos) and ( 64 | note_pos[idx_note] < word_pos[idx_word] 65 | or isclose(note_pos[idx_note], word_pos[idx_word], abs_tol=tol) 66 | ): 67 | new_note_seq.append(note_seq[idx_note]) 68 | new_note_dur.append(note_pos[idx_note]) 69 | note_slur.append(1 if slur else 0) 70 | slur = True 71 | idx_note += 1 72 | if new_note_dur[-1] < word_pos[idx_word]: 73 | if isclose(new_note_dur[-1], word_pos[idx_word], abs_tol=tol): 74 | new_note_dur[-1] = word_pos[idx_word] 75 | else: 76 | new_note_seq.append(note_seq[idx_note]) 77 | new_note_dur.append(word_pos[idx_word]) 78 | note_slur.append(1 if slur else 0) 79 | idx_word += 1 80 | ret_note_dur = np.diff(new_note_dur, prepend=Decimal("0.0")).tolist() 81 | assert len(new_note_seq) == len(ret_note_dur) == len(note_slur) 82 | return new_note_seq, ret_note_dur, note_slur 83 | 84 | 85 | @click.group() 86 | def cli(): 87 | pass 88 | 89 | 90 | @click.command(help="Convert a transcription file to DS files") 91 | @click.argument( 92 | "transcription_file", 93 | type=click.Path( 94 | dir_okay=False, 95 | resolve_path=True, 96 | path_type=pathlib.Path, 97 | exists=True, 98 | readable=True, 99 | ), 100 | metavar="TRANSCRIPTIONS", 101 | ) 102 | @click.argument( 103 | "wavs_folder", 104 | type=click.Path(file_okay=False, resolve_path=True, path_type=pathlib.Path), 105 | metavar="FOLDER", 106 | ) 107 | @click.option( 108 | "--tolerance", 109 | "-t", 110 | type=float, 111 | default=0.005, 112 | help="Tolerance for ph_dur/note_dur mismatch", 113 | metavar="FLOAT", 114 | ) 115 | @click.option( 116 | "--hop_size", "-h", type=int, default=512, help="Hop size for f0_seq", metavar="INT" 117 | ) 118 | @click.option( 119 | "--sample_rate", 120 | "-s", 121 | type=int, 122 | default=44100, 123 | help="Sample rate of audio", 124 | metavar="INT", 125 | ) 126 | @click.option( 127 | "--pe", 128 | type=str, 129 
| default="parselmouth", 130 | help="Pitch extractor (parselmouth, rmvpe)", 131 | metavar="ALGORITHM", 132 | ) 133 | def csv2ds(transcription_file, wavs_folder, tolerance, hop_size, sample_rate, pe): 134 | """Convert a transcription file to DS file""" 135 | assert wavs_folder.is_dir(), "wavs folder not found." 136 | out_ds = {} 137 | out_exists = [] 138 | with open(transcription_file, "r", encoding="utf-8") as f: 139 | for trans_line in tqdm(csv.DictReader(f)): 140 | item_name = trans_line["name"] 141 | wav_fn = wavs_folder / f"{item_name}.wav" 142 | ds_fn = wavs_folder / f"{item_name}.ds" 143 | ph_dur = list(map(Decimal, trans_line["ph_dur"].strip().split())) 144 | ph_num = list(map(int, trans_line["ph_num"].strip().split())) 145 | note_seq = trans_line["note_seq"].strip().split() 146 | note_dur = list(map(Decimal, trans_line["note_dur"].strip().split())) 147 | note_glide = trans_line["note_glide"].strip().split() if "note_glide" in trans_line else None 148 | 149 | assert wav_fn.is_file(), f"{item_name}.wav not found." 150 | assert len(ph_dur) == sum(ph_num), "ph_dur and ph_num mismatch." 151 | assert len(note_seq) == len(note_dur), "note_seq and note_dur should have the same length." 152 | if note_glide: 153 | assert len(note_glide) == len(note_seq), "note_glide and note_seq should have the same length." 154 | assert isclose( 155 | sum(ph_dur), sum(note_dur), abs_tol=tolerance 156 | ), f"[{item_name}] ERROR: mismatch total duration: {sum(ph_dur) - sum(note_dur)}" 157 | 158 | # Resolve note_slur 159 | if "note_slur" in trans_line and trans_line["note_slur"]: 160 | note_slur = list(map(int, trans_line["note_slur"].strip().split())) 161 | else: 162 | try: 163 | note_dur, note_slur = try_resolve_note_slur_by_matching( 164 | ph_dur, ph_num, note_dur, tolerance 165 | ) 166 | except ValueError: 167 | # logging.warning(f"note_slur is not resolved by matching for {item_name}") 168 | note_seq, note_dur, note_slur = try_resolve_slur_by_slicing( 169 | ph_dur, ph_num, note_seq, note_dur, tolerance 170 | ) 171 | # Extract f0_seq 172 | wav, _ = librosa.load(wav_fn, sr=sample_rate, mono=True) 173 | # length = len(wav) + (win_size - hop_size) // 2 + (win_size - hop_size + 1) // 2 174 | # length = ceil((length - win_size) / hop_size) 175 | f0_timestep, f0, _ = get_pitch(pe, wav, hop_size, sample_rate) 176 | ds_content = [ 177 | { 178 | "offset": 0.0, 179 | "text": trans_line["ph_seq"], 180 | "ph_seq": trans_line["ph_seq"], 181 | "ph_dur": " ".join(str(round(d, 6)) for d in ph_dur), 182 | "ph_num": trans_line["ph_num"], 183 | "note_seq": " ".join(note_seq), 184 | "note_dur": " ".join(str(round(d, 6)) for d in note_dur), 185 | "note_slur": " ".join(map(str, note_slur)), 186 | "f0_seq": " ".join(map("{:.1f}".format, f0)), 187 | "f0_timestep": str(f0_timestep), 188 | } 189 | ] 190 | if note_glide: 191 | ds_content[0]["note_glide"] = " ".join(note_glide) 192 | out_ds[ds_fn] = ds_content 193 | if ds_fn.exists(): 194 | out_exists.append(ds_fn) 195 | if not out_exists or click.confirm(f"Overwrite {len(out_exists)} existing DS files?", abort=False): 196 | for ds_fn, ds_content in out_ds.items(): 197 | with open(ds_fn, "w", encoding="utf-8") as f: 198 | json.dump(ds_content, f, ensure_ascii=False, indent=4) 199 | else: 200 | click.echo("Aborted.") 201 | 202 | 203 | @click.command(help="Convert DS files to a transcription and curve files") 204 | @click.argument( 205 | "ds_folder", 206 | type=click.Path(file_okay=False, resolve_path=True, exists=True, path_type=pathlib.Path), 207 | metavar="FOLDER", 208 | ) 209 | 
@click.argument( 210 | "transcription_file", 211 | type=click.Path(file_okay=True, dir_okay=False, resolve_path=True, path_type=pathlib.Path), 212 | metavar="TRANSCRIPTIONS", 213 | ) 214 | @click.option( 215 | "--overwrite", 216 | "-f", 217 | is_flag=True, 218 | default=False, 219 | help="Overwrite existing transcription file", 220 | ) 221 | def ds2csv(ds_folder, transcription_file, overwrite): 222 | """Convert DS files to a transcription file""" 223 | if not overwrite and transcription_file.exists(): 224 | raise FileExistsError(f"{transcription_file} already exist.") 225 | 226 | transcriptions = [] 227 | any_with_glide = False 228 | # records that have corresponding wav files, assuming it's midi annotation 229 | for fp in tqdm(ds_folder.glob("*.ds"), ncols=80): 230 | if fp.with_suffix(".wav").exists(): 231 | with open(fp, "r", encoding="utf-8") as f: 232 | ds = json.load(f) 233 | transcriptions.append( 234 | { 235 | "name": fp.stem, 236 | "ph_seq": ds[0]["ph_seq"], 237 | "ph_dur": " ".join(str(round(Decimal(d), 6)) for d in ds[0]["ph_dur"].split()), 238 | "ph_num": ds[0]["ph_num"], 239 | "note_seq": ds[0]["note_seq"], 240 | "note_dur": " ".join(str(round(Decimal(d), 6)) for d in ds[0]["note_dur"].split()), 241 | # "note_slur": ds[0]["note_slur"], 242 | } 243 | ) 244 | if "note_glide" in ds[0]: 245 | any_with_glide = True 246 | transcriptions[-1]["note_glide"] = ds[0]["note_glide"] 247 | # Lone DS files. 248 | for fp in tqdm(ds_folder.glob("*.ds"), ncols=80): 249 | if not fp.with_suffix(".wav").exists(): 250 | with open(fp, "r", encoding="utf-8") as f: 251 | ds = json.load(f) 252 | for idx, sub_ds in enumerate(ds): 253 | item_name = f"{fp.stem}#{idx}" if len(ds) > 1 else fp.stem 254 | transcriptions.append( 255 | { 256 | "name": item_name, 257 | "ph_seq": sub_ds["ph_seq"], 258 | "ph_dur": " ".join(str(round(Decimal(d), 6)) for d in sub_ds["ph_dur"].split()), 259 | "ph_num": sub_ds["ph_num"], 260 | "note_seq": sub_ds["note_seq"], 261 | "note_dur": " ".join(str(round(Decimal(d), 6)) for d in sub_ds["note_dur"].split()), 262 | # "note_slur": sub_ds["note_slur"], 263 | } 264 | ) 265 | if "note_glide" in sub_ds: 266 | any_with_glide = True 267 | transcriptions[-1]["note_glide"] = sub_ds["note_glide"] 268 | if any_with_glide: 269 | for row in transcriptions: 270 | if "note_glide" not in row: 271 | row["note_glide"] = " ".join(["none"] * len(row["note_seq"].split())) 272 | with open(transcription_file, "w", newline="", encoding="utf-8") as f: 273 | writer = csv.DictWriter( 274 | f, 275 | fieldnames=[ 276 | "name", 277 | "ph_seq", 278 | "ph_dur", 279 | "ph_num", 280 | "note_seq", 281 | "note_dur", 282 | # "note_slur", 283 | ] + (["note_glide"] if any_with_glide else []), 284 | ) 285 | writer.writeheader() 286 | writer.writerows(transcriptions) 287 | 288 | 289 | cli.add_command(csv2ds) 290 | cli.add_command(ds2csv) 291 | 292 | if __name__ == "__main__": 293 | cli() 294 | -------------------------------------------------------------------------------- /variance-temp-solution/convert_txt.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pathlib 3 | 4 | import click 5 | 6 | 7 | @click.command(help='Migrate transcriptions.txt in old datasets to transcriptions.csv') 8 | @click.argument('input_txt', metavar='INPUT') 9 | def convert_txt( 10 | input_txt: str 11 | ): 12 | input_txt = pathlib.Path(input_txt).resolve() 13 | assert input_txt.exists(), 'The input file does not exist.' 
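# Each line of transcriptions.txt is pipe-separated; only field 0 (item name),
# field 2 (phoneme sequence) and field 5 (phoneme durations) are carried over to the CSV.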
14 | with open(input_txt, 'r', encoding='utf8') as f: 15 | utterances = f.readlines() 16 | utterances = [u.split('|') for u in utterances] 17 | utterances = [ 18 | { 19 | 'name': u[0], 20 | 'ph_seq': u[2], 21 | 'ph_dur': u[5] 22 | } 23 | for u in utterances 24 | ] 25 | 26 | with open(input_txt.with_suffix('.csv'), 'w', encoding='utf8', newline='') as f: 27 | writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur']) 28 | writer.writeheader() 29 | writer.writerows(utterances) 30 | 31 | 32 | if __name__ == '__main__': 33 | convert_txt() 34 | -------------------------------------------------------------------------------- /variance-temp-solution/correct_cents.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import warnings 4 | from collections import OrderedDict 5 | 6 | import librosa 7 | import numpy as np 8 | import tqdm 9 | import pathlib 10 | from csv import DictReader, DictWriter 11 | 12 | import click 13 | 14 | from get_pitch import get_pitch_parselmouth 15 | 16 | warns = [] 17 | 18 | 19 | def get_aligned_pitch(wav_path: pathlib.Path, total_secs: float, timestep: float): 20 | waveform, _ = librosa.load(wav_path, sr=44100, mono=True) 21 | _, f0, _ = get_pitch_parselmouth(waveform, 512, 44100) 22 | pitch = librosa.hz_to_midi(f0) 23 | if pitch.shape[0] < total_secs / timestep: 24 | pad = math.ceil(total_secs / timestep) - pitch.shape[0] 25 | pitch = np.pad(pitch, [0, pad], mode='constant', constant_values=[0, pitch[-1]]) 26 | return pitch 27 | 28 | 29 | def correct_cents_item( 30 | name: str, item: OrderedDict, ref_pitch: np.ndarray, 31 | timestep: float, error_ratio: float 32 | ): 33 | note_seq = item['note_seq'].split() 34 | note_dur = [float(d) for d in item['note_dur'].split()] 35 | assert len(note_seq) == len(note_dur) 36 | 37 | start = 0. 
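# For each non-rest note, reference pitch frames within +/-50 cents of the labeled
# note are averaged to get the corrected value; when too few frames fall inside that
# window (less than error_ratio of the note's frames), the note is also recorded in
# warnings.csv as a possible labeling error.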
38 | note_seq_correct = [] 39 | for i, (note, dur) in enumerate(zip(note_seq, note_dur)): 40 | end = start + dur 41 | if note == 'rest': 42 | start = end 43 | note_seq_correct.append('rest') 44 | continue 45 | 46 | midi = librosa.note_to_midi(note, round_midi=False) 47 | start_idx = math.floor(start / timestep) 48 | end_idx = math.ceil(end / timestep) 49 | note_pitch = ref_pitch[start_idx: end_idx] 50 | note_pitch_close = note_pitch[(note_pitch >= midi - 0.5) & (note_pitch < midi + 0.5)] 51 | if len(note_pitch_close) < len(note_pitch) * error_ratio or len(note_pitch) == 0: 52 | warns.append({ 53 | 'position': name, 54 | 'note_index': i, 55 | 'note_value': note 56 | }) 57 | if len(note_pitch) == 0 or len(note_pitch_close) == 0: 58 | start = end 59 | note_seq_correct.append(note) 60 | continue 61 | midi_correct = np.mean(note_pitch_close) 62 | note_seq_correct.append(librosa.midi_to_note(midi_correct, cents=True, unicode=False)) 63 | 64 | start = end 65 | 66 | item['note_seq'] = ' '.join(note_seq_correct) 67 | 68 | 69 | def save_warnings(save_dir: pathlib.Path): 70 | if len(warns) > 0: 71 | save_path = save_dir.resolve() / 'warnings.csv' 72 | with open(save_path, 'w', encoding='utf8', newline='') as f: 73 | writer = DictWriter(f, fieldnames=['position', 'note_index', 'note_value']) 74 | writer.writeheader() 75 | writer.writerows(warns) 76 | warnings.warn( 77 | message=f'possible labeling errors saved in {save_path}', 78 | category=UserWarning 79 | ) 80 | warnings.filterwarnings(action='default') 81 | 82 | 83 | @click.group(help='Apply cents correction to note sequences') 84 | def correct_cents(): 85 | pass 86 | 87 | 88 | @correct_cents.command(help='Apply cents correction to note sequences in transcriptions.csv') 89 | @click.argument('transcriptions', metavar='TRANSCRIPTIONS') 90 | @click.argument('waveforms', metavar='WAVS') 91 | @click.option('--error_ratio', metavar='RATIO', type=float, default=0.4, 92 | help='If the percentage of pitch points within a deviation of 50 cents compared to the note label ' 93 | 'is lower than this value, a warning will be raised.') 94 | def csv( 95 | transcriptions, 96 | waveforms, 97 | error_ratio 98 | ): 99 | transcriptions = pathlib.Path(transcriptions).resolve() 100 | waveforms = pathlib.Path(waveforms).resolve() 101 | with open(transcriptions, 'r', encoding='utf8') as f: 102 | reader = DictReader(f) 103 | items: list[OrderedDict] = [] 104 | for item in reader: 105 | items.append(OrderedDict(item)) 106 | 107 | timestep = 512 / 44100 108 | for item in tqdm.tqdm(items): 109 | item: OrderedDict 110 | ref_pitch = get_aligned_pitch( 111 | wav_path=waveforms / (item['name'] + '.wav'), 112 | total_secs=sum(float(d) for d in item['note_dur'].split()), 113 | timestep=timestep 114 | ) 115 | correct_cents_item( 116 | name=item['name'], item=item, ref_pitch=ref_pitch, 117 | timestep=timestep, error_ratio=error_ratio 118 | ) 119 | 120 | with open(transcriptions, 'w', encoding='utf8', newline='') as f: 121 | writer = DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur', 'ph_num', 'note_seq', 'note_dur']) 122 | writer.writeheader() 123 | writer.writerows(items) 124 | save_warnings(transcriptions.parent) 125 | 126 | 127 | @correct_cents.command(help='Apply cents correction to note sequences in DS files') 128 | @click.argument('ds_dir', metavar='DS_DIR') 129 | @click.option('--error_ratio', metavar='RATIO', type=float, default=0.4, 130 | help='If the percentage of pitch points within a deviation of 50 cents compared to the note label ' 131 | 'is lower than this value, a 
warning will be raised.') 132 | def ds( 133 | ds_dir, 134 | error_ratio 135 | ): 136 | ds_dir = pathlib.Path(ds_dir).resolve() 137 | assert ds_dir.exists(), 'The directory of DS files does not exist.' 138 | 139 | timestep = 512 / 44100 140 | for ds_file in tqdm.tqdm(ds_dir.glob('*.ds')): 141 | if not ds_file.is_file(): 142 | continue 143 | 144 | assert ds_file.with_suffix('.wav').exists(), \ 145 | f'Missing corresponding .wav file of {ds_file.name}.' 146 | with open(ds_file, 'r', encoding='utf8') as f: 147 | params = json.load(f) 148 | if not isinstance(params, list): 149 | params = [params] 150 | params = [OrderedDict(p) for p in params] 151 | 152 | ref_pitch = get_aligned_pitch( 153 | wav_path=ds_file.with_suffix('.wav'), 154 | total_secs=params[-1]['offset'] + sum(float(d) for d in params[-1]['note_dur'].split()), 155 | timestep=timestep 156 | ) 157 | for i, param in enumerate(params): 158 | start_idx = math.floor(param['offset'] / timestep) 159 | end_idx = math.ceil((param['offset'] + sum(float(d) for d in param['note_dur'].split())) / timestep) 160 | correct_cents_item( 161 | name=f'{ds_file.stem}#{i}', item=param, ref_pitch=ref_pitch[start_idx: end_idx], 162 | timestep=timestep, error_ratio=error_ratio 163 | ) 164 | 165 | with open(ds_file, 'w', encoding='utf8') as f: 166 | json.dump(params, f, ensure_ascii=False, indent=2) 167 | save_warnings(ds_dir) 168 | 169 | 170 | if __name__ == '__main__': 171 | correct_cents() 172 | -------------------------------------------------------------------------------- /variance-temp-solution/eliminate_short.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | from collections import OrderedDict 4 | 5 | import click 6 | 7 | 8 | @click.command(help='Eliminate short slur notes in DS files') 9 | @click.argument('ds_dir', metavar='DS_DIR') 10 | @click.argument('threshold', type=float, metavar='THRESHOLD') 11 | def eliminate_short( 12 | ds_dir, 13 | threshold: float 14 | ): 15 | ds_dir = pathlib.Path(ds_dir).resolve() 16 | assert ds_dir.exists(), 'The directory of DS files does not exist.' 17 | 18 | for ds in ds_dir.iterdir(): 19 | if not ds.is_file() or ds.suffix != '.ds': 20 | continue 21 | 22 | with open(ds, 'r', encoding='utf8') as f: 23 | params = json.load(f) 24 | if not isinstance(params, list): 25 | params = [params] 26 | params = [OrderedDict(p) for p in params] 27 | 28 | for param in params: 29 | note_list = [ 30 | (note, float(dur), bool(int(slur))) 31 | for note, dur, slur 32 | in zip(param['note_seq'].split(), param['note_dur'].split(), param['note_slur'].split()) 33 | ] 34 | word_note_div = [] 35 | cache = [] 36 | for note in note_list: 37 | if len(cache) == 0 or note[2]: 38 | cache.append(note) 39 | else: 40 | word_note_div.append(cache) 41 | cache = [note] 42 | if len(cache) > 0: 43 | word_note_div.append(cache) 44 | 45 | word_note_div_new = [] 46 | for i in range(len(word_note_div)): 47 | word_note_seq = word_note_div[i] 48 | if len(word_note_seq) == 1 or all(n[1] < threshold for n in word_note_seq): 49 | word_note_div_new.append(word_note_seq) 50 | continue 51 | 52 | word_note_seq_new = [] 53 | j = 0 54 | prev_merge = 0. 
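# Merge strategy: leading short notes are absorbed into the first long note of the
# word; short notes between two long notes are split evenly between them; trailing
# short notes are absorbed into the last long note.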
55 | while word_note_seq[j][1] < threshold: 56 | # Enumerate leading short notes 57 | prev_merge += word_note_seq[j][1] 58 | j += 1 59 | # Iter note sequence 60 | while j < len(word_note_seq): 61 | k = j + 1 62 | while k < len(word_note_seq) and word_note_seq[k][1] < threshold: 63 | k += 1 64 | post_merge = sum(n[1] for n in word_note_seq[j + 1: k]) 65 | if k < len(word_note_seq): 66 | post_merge /= 2 67 | word_note_seq_new.append( 68 | (word_note_seq[j][0], prev_merge + word_note_seq[j][1] + post_merge, False) 69 | ) 70 | prev_merge = post_merge 71 | j = k 72 | 73 | word_note_div_new.append(word_note_seq_new) 74 | 75 | note_seq_new = [] 76 | note_dur_new = [] 77 | note_slur_new = [] 78 | for word_note_seq in word_note_div_new: 79 | note_seq_new += [n[0] for n in word_note_seq] 80 | note_dur_new += [n[1] for n in word_note_seq] 81 | note_slur_new += [pos > 0 for pos in range(len(word_note_seq))] 82 | param['note_seq'] = ' '.join(note_seq_new) 83 | param['note_dur'] = ' '.join(str(round(d, 6)) for d in note_dur_new) 84 | param['note_slur'] = ' '.join(str(int(s)) for s in note_slur_new) 85 | 86 | with open(ds, 'w', encoding='utf8') as f: 87 | json.dump(params, f, ensure_ascii=False, indent=2) 88 | 89 | 90 | if __name__ == '__main__': 91 | eliminate_short() 92 | -------------------------------------------------------------------------------- /variance-temp-solution/estimate_midi.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import math 3 | import pathlib 4 | 5 | import click 6 | import librosa 7 | import numpy as np 8 | import tqdm 9 | from typing import List 10 | 11 | from get_pitch import get_pitch 12 | 13 | 14 | @click.command(help='Estimate note pitch from transcriptions and corresponding waveforms') 15 | @click.argument('transcriptions', metavar='TRANSCRIPTIONS') 16 | @click.argument('waveforms', metavar='WAVS') 17 | @click.option('--pe', metavar='ALGORITHM', default='parselmouth', 18 | help='Pitch extractor (parselmouth, rmvpe)') 19 | @click.option('--rest_uv_ratio', metavar='RATIO', type=float, default=0.85, 20 | help='The minimum percentage of unvoiced length for a note to be regarded as rest') 21 | def estimate_midi( 22 | transcriptions: str, 23 | waveforms: str, 24 | pe: str = 'parselmouth', 25 | rest_uv_ratio: float = 0.85 26 | ): 27 | transcriptions = pathlib.Path(transcriptions).resolve() 28 | waveforms = pathlib.Path(waveforms).resolve() 29 | with open(transcriptions, 'r', encoding='utf8') as f: 30 | reader = csv.DictReader(f) 31 | items: List[dict] = [] 32 | for item in reader: 33 | items.append(item) 34 | 35 | timestep = 512 / 44100 36 | for item in tqdm.tqdm(items): 37 | item: dict 38 | ph_dur = [float(d) for d in item['ph_dur'].split()] 39 | ph_num = [int(n) for n in item['ph_num'].split()] 40 | assert sum(ph_num) == len(ph_dur), f'ph_num does not sum to number of phones in \'{item["name"]}\'.' 41 | 42 | word_dur = [] 43 | i = 0 44 | for num in ph_num: 45 | word_dur.append(sum(ph_dur[i: i + num])) 46 | i += num 47 | 48 | total_secs = sum(ph_dur) 49 | waveform, _ = librosa.load(waveforms / (item['name'] + '.wav'), sr=44100, mono=True) 50 | _, f0, uv = get_pitch(pe, waveform, 512, 44100) 51 | pitch = librosa.hz_to_midi(f0) 52 | if pitch.shape[0] < total_secs / timestep: 53 | pad = math.ceil(total_secs / timestep) - pitch.shape[0] 54 | pitch = np.pad(pitch, [0, pad], mode='constant', constant_values=[0, pitch[-1]]) 55 | uv = np.pad(uv, [0, pad], mode='constant') 56 | 57 | note_seq = [] 58 | note_dur = [] 59 | start = 0. 
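# A word is labeled 'rest' when fewer than (1 - rest_uv_ratio) of its frames are
# voiced; otherwise the most common rounded MIDI value is taken and refined as the
# mean of the frames within +/-50 cents of it.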
60 | for dur in word_dur: 61 | end = start + dur 62 | start_idx = math.floor(start / timestep) 63 | end_idx = math.ceil(end / timestep) 64 | word_pitch = pitch[start_idx: end_idx] 65 | word_uv = uv[start_idx: end_idx] 66 | word_valid_pitch = np.extract(~word_uv & (word_pitch >= 0), word_pitch) 67 | if len(word_valid_pitch) < (1 - rest_uv_ratio) * (end_idx - start_idx): 68 | note_seq.append('rest') 69 | else: 70 | counts = np.bincount(np.round(word_valid_pitch).astype(np.int64)) 71 | midi = counts.argmax() 72 | midi = np.mean(word_valid_pitch[(word_valid_pitch >= midi - 0.5) & (word_valid_pitch < midi + 0.5)]) 73 | note_seq.append(librosa.midi_to_note(midi, cents=True, unicode=False)) 74 | note_dur.append(dur) 75 | 76 | start = end 77 | 78 | item['note_seq'] = ' '.join(note_seq) 79 | item['note_dur'] = ' '.join([str(round(d, 6)) for d in note_dur]) 80 | 81 | with open(transcriptions, 'w', encoding='utf8', newline='') as f: 82 | writer = csv.DictWriter(f, fieldnames=['name', 'ph_seq', 'ph_dur', 'ph_num', 'note_seq', 'note_dur']) 83 | writer.writeheader() 84 | writer.writerows(items) 85 | 86 | 87 | if __name__ == '__main__': 88 | estimate_midi() 89 | -------------------------------------------------------------------------------- /variance-temp-solution/get_pitch.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | import numpy as np 4 | import parselmouth 5 | 6 | 7 | def norm_f0(f0): 8 | f0 = np.log2(f0) 9 | return f0 10 | 11 | 12 | def denorm_f0(f0, uv, pitch_padding=None): 13 | f0 = 2 ** f0 14 | if uv is not None: 15 | f0[uv > 0] = 0 16 | if pitch_padding is not None: 17 | f0[pitch_padding] = 0 18 | return f0 19 | 20 | 21 | def interp_f0(f0, uv=None): 22 | if uv is None: 23 | uv = f0 == 0 24 | f0 = norm_f0(f0) 25 | if sum(uv) == len(f0): 26 | f0[uv] = -np.inf 27 | elif sum(uv) > 0: 28 | f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv]) 29 | return denorm_f0(f0, uv=None), uv 30 | 31 | 32 | def resample_align_curve(points: np.ndarray, original_timestep: float, target_timestep: float, align_length: int): 33 | t_max = (len(points) - 1) * original_timestep 34 | curve_interp = np.interp( 35 | np.arange(0, t_max, target_timestep), 36 | original_timestep * np.arange(len(points)), 37 | points 38 | ).astype(points.dtype) 39 | delta_l = align_length - len(curve_interp) 40 | if delta_l < 0: 41 | curve_interp = curve_interp[:align_length] 42 | elif delta_l > 0: 43 | curve_interp = np.concatenate((curve_interp, np.full(delta_l, fill_value=curve_interp[-1])), axis=0) 44 | return curve_interp 45 | 46 | 47 | def get_pitch_parselmouth(wav_data, hop_size, audio_sample_rate, interp_uv=True): 48 | time_step = hop_size / audio_sample_rate 49 | f0_min = 65. 50 | f0_max = 1100. 
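# Praat autocorrelation pitch tracking with one frame per hop; unvoiced frames
# (f0 == 0) are optionally filled by log-domain interpolation in interp_f0.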
51 | 52 | # noinspection PyArgumentList 53 | f0 = ( 54 | parselmouth.Sound(wav_data, sampling_frequency=audio_sample_rate) 55 | .to_pitch_ac( 56 | time_step=time_step, voicing_threshold=0.6, pitch_floor=f0_min, pitch_ceiling=f0_max 57 | ).selected_array["frequency"] 58 | ) 59 | uv = f0 == 0 60 | if interp_uv: 61 | f0, uv = interp_f0(f0, uv) 62 | return time_step, f0, uv 63 | 64 | 65 | rmvpe = None 66 | 67 | 68 | def get_pitch_rmvpe(wav_data, hop_size, audio_sample_rate, interp_uv=True): 69 | global rmvpe 70 | if rmvpe is None: 71 | from rmvpe import RMVPE 72 | rmvpe = RMVPE(pathlib.Path(__file__).parent / 'assets' / 'rmvpe' / 'model.pt') 73 | f0 = rmvpe.infer_from_audio(wav_data, sample_rate=audio_sample_rate) 74 | uv = f0 == 0 75 | f0, uv = interp_f0(f0, uv) 76 | 77 | time_step = hop_size / audio_sample_rate 78 | length = (wav_data.shape[0] + hop_size - 1) // hop_size 79 | f0_res = resample_align_curve(f0, 0.01, time_step, length) 80 | uv_res = resample_align_curve(uv.astype(np.float32), 0.01, time_step, length) > 0.5 81 | if not interp_uv: 82 | f0_res[uv_res] = 0 83 | return time_step, f0_res, uv_res 84 | 85 | 86 | def get_pitch(algorithm, wav_data, hop_size, audio_sample_rate, interp_uv=True): 87 | if algorithm == 'parselmouth': 88 | return get_pitch_parselmouth(wav_data, hop_size, audio_sample_rate, interp_uv=interp_uv) 89 | elif algorithm == 'rmvpe': 90 | return get_pitch_rmvpe(wav_data, hop_size, audio_sample_rate, interp_uv=interp_uv) 91 | else: 92 | raise ValueError(f" [x] Unknown f0 extractor: {algorithm}") 93 | -------------------------------------------------------------------------------- /variance-temp-solution/requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | librosa<0.10.0 3 | numpy==1.23.5 4 | praat-parselmouth==0.4.3 5 | tqdm 6 | -------------------------------------------------------------------------------- /variance-temp-solution/rmvpe/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import RMVPE 2 | -------------------------------------------------------------------------------- /variance-temp-solution/rmvpe/constants.py: -------------------------------------------------------------------------------- 1 | SAMPLE_RATE = 16000 2 | 3 | N_CLASS = 360 4 | 5 | N_MELS = 128 6 | MEL_FMIN = 30 7 | MEL_FMAX = 8000 8 | WINDOW_LENGTH = 1024 9 | CONST = 1997.3794084376191 10 | -------------------------------------------------------------------------------- /variance-temp-solution/rmvpe/deepunet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .constants import N_MELS 4 | 5 | 6 | class ConvBlockRes(nn.Module): 7 | def __init__(self, in_channels, out_channels, momentum=0.01): 8 | super(ConvBlockRes, self).__init__() 9 | self.conv = nn.Sequential( 10 | nn.Conv2d(in_channels=in_channels, 11 | out_channels=out_channels, 12 | kernel_size=(3, 3), 13 | stride=(1, 1), 14 | padding=(1, 1), 15 | bias=False), 16 | nn.BatchNorm2d(out_channels, momentum=momentum), 17 | nn.ReLU(), 18 | 19 | nn.Conv2d(in_channels=out_channels, 20 | out_channels=out_channels, 21 | kernel_size=(3, 3), 22 | stride=(1, 1), 23 | padding=(1, 1), 24 | bias=False), 25 | nn.BatchNorm2d(out_channels, momentum=momentum), 26 | nn.ReLU(), 27 | ) 28 | if in_channels != out_channels: 29 | self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) 30 | self.is_shortcut = True 31 | else: 32 | self.is_shortcut = False 
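    # Residual block: when the channel counts differ, a 1x1 convolution projects the input
    # for the skip connection; otherwise the input is added to the output directly.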
33 | 34 | def forward(self, x): 35 | if self.is_shortcut: 36 | return self.conv(x) + self.shortcut(x) 37 | else: 38 | return self.conv(x) + x 39 | 40 | 41 | class ResEncoderBlock(nn.Module): 42 | def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01): 43 | super(ResEncoderBlock, self).__init__() 44 | self.n_blocks = n_blocks 45 | self.conv = nn.ModuleList() 46 | self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) 47 | for i in range(n_blocks - 1): 48 | self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) 49 | self.kernel_size = kernel_size 50 | if self.kernel_size is not None: 51 | self.pool = nn.AvgPool2d(kernel_size=kernel_size) 52 | 53 | def forward(self, x): 54 | for i in range(self.n_blocks): 55 | x = self.conv[i](x) 56 | if self.kernel_size is not None: 57 | return x, self.pool(x) 58 | else: 59 | return x 60 | 61 | 62 | class ResDecoderBlock(nn.Module): 63 | def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): 64 | super(ResDecoderBlock, self).__init__() 65 | out_padding = (0, 1) if stride == (1, 2) else (1, 1) 66 | self.n_blocks = n_blocks 67 | self.conv1 = nn.Sequential( 68 | nn.ConvTranspose2d(in_channels=in_channels, 69 | out_channels=out_channels, 70 | kernel_size=(3, 3), 71 | stride=stride, 72 | padding=(1, 1), 73 | output_padding=out_padding, 74 | bias=False), 75 | nn.BatchNorm2d(out_channels, momentum=momentum), 76 | nn.ReLU(), 77 | ) 78 | self.conv2 = nn.ModuleList() 79 | self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) 80 | for i in range(n_blocks-1): 81 | self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) 82 | 83 | def forward(self, x, concat_tensor): 84 | x = self.conv1(x) 85 | x = torch.cat((x, concat_tensor), dim=1) 86 | for i in range(self.n_blocks): 87 | x = self.conv2[i](x) 88 | return x 89 | 90 | 91 | class Encoder(nn.Module): 92 | def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01): 93 | super(Encoder, self).__init__() 94 | self.n_encoders = n_encoders 95 | self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) 96 | self.layers = nn.ModuleList() 97 | self.latent_channels = [] 98 | for i in range(self.n_encoders): 99 | self.layers.append(ResEncoderBlock(in_channels, out_channels, kernel_size, n_blocks, momentum=momentum)) 100 | self.latent_channels.append([out_channels, in_size]) 101 | in_channels = out_channels 102 | out_channels *= 2 103 | in_size //= 2 104 | self.out_size = in_size 105 | self.out_channel = out_channels 106 | 107 | def forward(self, x): 108 | concat_tensors = [] 109 | x = self.bn(x) 110 | for i in range(self.n_encoders): 111 | _, x = self.layers[i](x) 112 | concat_tensors.append(_) 113 | return x, concat_tensors 114 | 115 | 116 | class Intermediate(nn.Module): 117 | def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): 118 | super(Intermediate, self).__init__() 119 | self.n_inters = n_inters 120 | self.layers = nn.ModuleList() 121 | self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)) 122 | for i in range(self.n_inters-1): 123 | self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)) 124 | 125 | def forward(self, x): 126 | for i in range(self.n_inters): 127 | x = self.layers[i](x) 128 | return x 129 | 130 | 131 | class Decoder(nn.Module): 132 | def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): 133 | super(Decoder, self).__init__() 
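        # Each decoder stage halves the channel count; ResDecoderBlock upsamples, then
        # concatenates with the matching encoder feature map (fed in reverse order by forward()).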
134 | self.layers = nn.ModuleList() 135 | self.n_decoders = n_decoders 136 | for i in range(self.n_decoders): 137 | out_channels = in_channels // 2 138 | self.layers.append(ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)) 139 | in_channels = out_channels 140 | 141 | def forward(self, x, concat_tensors): 142 | for i in range(self.n_decoders): 143 | x = self.layers[i](x, concat_tensors[-1-i]) 144 | return x 145 | 146 | 147 | class TimbreFilter(nn.Module): 148 | def __init__(self, latent_rep_channels): 149 | super(TimbreFilter, self).__init__() 150 | self.layers = nn.ModuleList() 151 | for latent_rep in latent_rep_channels: 152 | self.layers.append(ConvBlockRes(latent_rep[0], latent_rep[0])) 153 | 154 | def forward(self, x_tensors): 155 | out_tensors = [] 156 | for i, layer in enumerate(self.layers): 157 | out_tensors.append(layer(x_tensors[i])) 158 | return out_tensors 159 | 160 | 161 | class DeepUnet0(nn.Module): 162 | def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16): 163 | super(DeepUnet0, self).__init__() 164 | self.encoder = Encoder(in_channels, N_MELS, en_de_layers, kernel_size, n_blocks, en_out_channels) 165 | self.intermediate = Intermediate(self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks) 166 | self.tf = TimbreFilter(self.encoder.latent_channels) 167 | self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks) 168 | 169 | def forward(self, x): 170 | x, concat_tensors = self.encoder(x) 171 | x = self.intermediate(x) 172 | x = self.decoder(x, concat_tensors) 173 | return x 174 | -------------------------------------------------------------------------------- /variance-temp-solution/rmvpe/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torchaudio.transforms import Resample 4 | 5 | from .constants import * 6 | from .model import E2E0 7 | from .spec import MelSpectrogram 8 | from .utils import to_local_average_f0, to_viterbi_f0 9 | 10 | 11 | class RMVPE: 12 | def __init__(self, model_path, hop_length=160): 13 | self.resample_kernel = {} 14 | self.device = 'cuda' if torch.cuda.is_available() else 'cpu' 15 | self.model = E2E0(4, 1, (2, 2)).eval().to(self.device) 16 | ckpt = torch.load(model_path, map_location=self.device) 17 | self.model.load_state_dict(ckpt['model'], strict=False) 18 | self.mel_extractor = MelSpectrogram( 19 | N_MELS, SAMPLE_RATE, WINDOW_LENGTH, hop_length, None, MEL_FMIN, MEL_FMAX 20 | ).to(self.device) 21 | 22 | @torch.no_grad() 23 | def mel2hidden(self, mel): 24 | n_frames = mel.shape[-1] 25 | mel = F.pad(mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode='constant') 26 | hidden = self.model(mel) 27 | return hidden[:, :n_frames] 28 | 29 | def decode(self, hidden, thred=0.03, use_viterbi=False): 30 | if use_viterbi: 31 | f0 = to_viterbi_f0(hidden, thred=thred) 32 | else: 33 | f0 = to_local_average_f0(hidden, thred=thred) 34 | return f0 35 | 36 | def infer_from_audio(self, audio, sample_rate=16000, thred=0.03, use_viterbi=False): 37 | audio = torch.from_numpy(audio).float().unsqueeze(0).to(self.device) 38 | if sample_rate == 16000: 39 | audio_res = audio 40 | else: 41 | key_str = str(sample_rate) 42 | if key_str not in self.resample_kernel: 43 | self.resample_kernel[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128) 44 | self.resample_kernel[key_str] = self.resample_kernel[key_str].to(self.device) 45 | 
audio_res = self.resample_kernel[key_str](audio) 46 | mel = self.mel_extractor(audio_res, center=True) 47 | hidden = self.mel2hidden(mel) 48 | f0 = self.decode(hidden, thred=thred, use_viterbi=use_viterbi) 49 | return f0 50 | -------------------------------------------------------------------------------- /variance-temp-solution/rmvpe/model.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from .constants import * 4 | from .deepunet import DeepUnet0 5 | from .seq import BiGRU 6 | 7 | 8 | class E2E0(nn.Module): 9 | def __init__(self, n_blocks, n_gru, kernel_size, en_de_layers=5, inter_layers=4, in_channels=1, 10 | en_out_channels=16): 11 | super(E2E0, self).__init__() 12 | self.unet = DeepUnet0(kernel_size, n_blocks, en_de_layers, inter_layers, in_channels, en_out_channels) 13 | self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) 14 | if n_gru: 15 | self.fc = nn.Sequential( 16 | BiGRU(3 * N_MELS, 256, n_gru), 17 | nn.Linear(512, N_CLASS), 18 | nn.Dropout(0.25), 19 | nn.Sigmoid() 20 | ) 21 | else: 22 | self.fc = nn.Sequential( 23 | nn.Linear(3 * N_MELS, N_CLASS), 24 | nn.Dropout(0.25), 25 | nn.Sigmoid() 26 | ) 27 | 28 | def forward(self, mel): 29 | mel = mel.transpose(-1, -2).unsqueeze(1) 30 | x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) 31 | x = self.fc(x) 32 | return x 33 | -------------------------------------------------------------------------------- /variance-temp-solution/rmvpe/seq.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class BiGRU(nn.Module): 5 | def __init__(self, input_features, hidden_features, num_layers): 6 | super(BiGRU, self).__init__() 7 | self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True) 8 | 9 | def forward(self, x): 10 | return self.gru(x)[0] 11 | -------------------------------------------------------------------------------- /variance-temp-solution/rmvpe/spec.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn.functional as F 4 | from librosa.filters import mel 5 | 6 | 7 | class MelSpectrogram(torch.nn.Module): 8 | def __init__( 9 | self, 10 | n_mel_channels, 11 | sampling_rate, 12 | win_length, 13 | hop_length, 14 | n_fft=None, 15 | mel_fmin=0, 16 | mel_fmax=None, 17 | clamp=1e-5 18 | ): 19 | super().__init__() 20 | n_fft = win_length if n_fft is None else n_fft 21 | self.hann_window = {} 22 | mel_basis = mel( 23 | sr=sampling_rate, 24 | n_fft=n_fft, 25 | n_mels=n_mel_channels, 26 | fmin=mel_fmin, 27 | fmax=mel_fmax, 28 | htk=True) 29 | mel_basis = torch.from_numpy(mel_basis).float() 30 | self.register_buffer("mel_basis", mel_basis) 31 | self.n_fft = win_length if n_fft is None else n_fft 32 | self.hop_length = hop_length 33 | self.win_length = win_length 34 | self.sampling_rate = sampling_rate 35 | self.n_mel_channels = n_mel_channels 36 | self.clamp = clamp 37 | 38 | def forward(self, audio, keyshift=0, speed=1, center=True): 39 | factor = 2 ** (keyshift / 12) 40 | n_fft_new = int(np.round(self.n_fft * factor)) 41 | win_length_new = int(np.round(self.win_length * factor)) 42 | hop_length_new = int(np.round(self.hop_length * speed)) 43 | 44 | keyshift_key = str(keyshift) + '_' + str(audio.device) 45 | if keyshift_key not in self.hann_window: 46 | self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(audio.device) 47 | 48 | fft = 
torch.stft( 49 | audio, 50 | n_fft=n_fft_new, 51 | hop_length=hop_length_new, 52 | win_length=win_length_new, 53 | window=self.hann_window[keyshift_key], 54 | center=center, 55 | return_complex=True 56 | ) 57 | magnitude = fft.abs() 58 | 59 | if keyshift != 0: 60 | size = self.n_fft // 2 + 1 61 | resize = magnitude.size(1) 62 | if resize < size: 63 | magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) 64 | magnitude = magnitude[:, :size, :] * self.win_length / win_length_new 65 | 66 | mel_output = torch.matmul(self.mel_basis, magnitude) 67 | log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) 68 | return log_mel_spec 69 | -------------------------------------------------------------------------------- /variance-temp-solution/rmvpe/utils.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | import torch 4 | 5 | from .constants import * 6 | 7 | 8 | def to_local_average_f0(hidden, center=None, thred=0.03): 9 | idx = torch.arange(N_CLASS, device=hidden.device)[None, None, :] # [B=1, T=1, N] 10 | idx_cents = idx * 20 + CONST # [B=1, N] 11 | if center is None: 12 | center = torch.argmax(hidden, dim=2, keepdim=True) # [B, T, 1] 13 | start = torch.clip(center - 4, min=0) # [B, T, 1] 14 | end = torch.clip(center + 5, max=N_CLASS) # [B, T, 1] 15 | idx_mask = (idx >= start) & (idx < end) # [B, T, N] 16 | weights = hidden * idx_mask # [B, T, N] 17 | product_sum = torch.sum(weights * idx_cents, dim=2) # [B, T] 18 | weight_sum = torch.sum(weights, dim=2) # [B, T] 19 | cents = product_sum / (weight_sum + (weight_sum == 0)) # avoid dividing by zero, [B, T] 20 | f0 = 10 * 2 ** (cents / 1200) 21 | uv = hidden.max(dim=2)[0] < thred # [B, T] 22 | f0 = f0 * ~uv 23 | return f0.squeeze(0).cpu().numpy() 24 | 25 | 26 | def to_viterbi_f0(hidden, thred=0.03): 27 | # Create viterbi transition matrix 28 | if not hasattr(to_viterbi_f0, 'transition'): 29 | xx, yy = np.meshgrid(range(N_CLASS), range(N_CLASS)) 30 | transition = np.maximum(30 - abs(xx - yy), 0) 31 | transition = transition / transition.sum(axis=1, keepdims=True) 32 | to_viterbi_f0.transition = transition 33 | 34 | # Convert to probability 35 | prob = hidden.squeeze(0).cpu().numpy() 36 | prob = prob.T 37 | prob = prob / prob.sum(axis=0) 38 | 39 | # Perform viterbi decoding 40 | path = librosa.sequence.viterbi(prob, to_viterbi_f0.transition).astype(np.int64) 41 | center = torch.from_numpy(path).unsqueeze(0).unsqueeze(-1).to(hidden.device) 42 | 43 | return to_local_average_f0(hidden, center=center, thred=thred) 44 | --------------------------------------------------------------------------------