├── .gitignore
├── LICENSE
├── README.md
├── config.json
├── config_examples
│   ├── config_metricGAN.json
│   ├── config_metricGAN_low_snr.json
│   ├── config_pix2pix_test.json
│   ├── config_pix2pix_train.json
│   └── config_unet_gan.json
├── frequency_domain_0.py
├── frequency_domain_1.py
├── mask_0.py
├── time_domain.py
├── time_domain_wav.py
└── utils
    ├── __init__.py
    ├── extract_features.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | !data/data/.gitkeep
2 | dist*/
3 | .ipynb_checkpoints/
4 | .vscode
5 | data
6 | snippet
7 |
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | .hypothesis/
55 | .pytest_cache/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 郝翔
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Build Speech Enhancement Dataset
2 |
3 | Scripts for building speech enhancement datasets by mixing clean speech with noise at specified SNRs.
4 |
5 | ## Dependencies
6 |
7 | - tqdm
8 | - pytorch
9 | - librosa
10 |
11 | ## Supported features
12 |
13 | - time_domain: Speech level. The noisy waveform corresponds to the clean waveform.
14 | - time_domain_wav: Same as above, except that it saves each utterance as a separate .wav file instead of storing all the speech in a .pkl file.
15 | - frequency_domain_0: Speech level. The noisy spectrum corresponds to the clean spectrum, and they have the same size.
16 | - frequency_domain_1: Frame level. The noisy spectrum has multiple frames, and the clean speech is one frame. The center frame of the noisy spectrum is aligned with the frame of the clean speech.
17 | - frequency_domain_2: Frame level. The noisy spectrum has multiple frames, and the clean speech has multiple frames. They have the same number of frames.
18 | - mask_0: Frame level. The noisy spectrum has multiple frames, and the mask is one frame. The center frame of the noisy spectrum is aligned with the frame of the mask.
19 |
20 | ## Usage
21 |
22 | ```shell
23 | python [time_domain.py|time_domain_wav.py|frequency_domain_0.py|frequency_domain_1.py|mask_0.py] -C config.json
24 | ```
25 |
26 | ## ToDo
27 |
28 | - [x] Replace .npy with a more efficient format
29 | - [ ] Add more parameters for spectrum extraction
30 | - [ ] Add a count parameter to work with min_sampling
31 |
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "dataset": [
3 |         {
4 |             "name": "train",
5 |             "clean": {
6 |                 "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN",
7 |                 "sampling_rate": 16000,
8 |                 "min_sampling": 16384,
9 |                 "ext": "WAV",
10 |                 "recurse": true,
11 |                 "limit": 400,
12 |                 "offset": 0
13 |             },
14 |             "noise": {
15 |                 "database": "./data/noise",
16 |                 "sampling_rate": 16000,
17 |                 "types": [
18 |                     "destroyerops",
19 |                     "f16",
20 |                     "factoryfloor2",
21 |                     "leopard",
22 |                     "m109"
23 |                 ]
24 |             },
25 |             "snr": [
26 |                 0,
27 |                 -5
28 |             ]
29 |         },
30 |         {
31 |             "name": "test",
32 |             "clean": {
33 |                 "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TEST",
34 |                 "sampling_rate": 16000,
35 |                 "min_sampling": 16384,
36 |                 "ext": "WAV",
37 |                 "recurse": true,
38 |                 "limit": 50,
39 |                 "offset": 0
40 |             },
41 |             "noise": {
42 |                 "database": "./data/noise",
43 |                 "sampling_rate": 16000,
44 |                 "types": [
45 |                     "babble",
46 |                     "factoryfloor1",
47 |                     "destroyerengine"
48 |                 ]
49 |             },
50 |             "snr": [
51 |                 -5,
52 |                 0,
53 |                 5
54 |             ]
55 |         }
56 |     ]
57 | }
58 |
--------------------------------------------------------------------------------
/config_examples/config_metricGAN.json:
--------------------------------------------------------------------------------
1 | {
2 |     "dataset": [
3 |         {
4 |             "name": "Train",
5 |             "clean": {
6 |                 "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN",
7 |                 "sampling_rate": 16000,
8 |                 "min_sampling": 16384,
9 |                 "ext": "WAV",
10 |                 "recurse": true,
11 |                 "limit": 300,
12 |                 "offset": 0
13 |             },
14 |             "noise": {
15 |                 "database": "./data/Nonspeech",
16 | "sampling_rate": 16000, 17 | "types": [ 18 | "n1", 19 | "n18", 20 | "n20", 21 | "n30", 22 | "n46", 23 | "n51", 24 | "n56", 25 | "n72", 26 | "n79", 27 | "n88" 28 | ] 29 | }, 30 | "snr": [ 31 | -8, 32 | -4, 33 | -0, 34 | 4, 35 | 8 36 | ] 37 | }, 38 | { 39 | "name": "Test", 40 | "clean": { 41 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 42 | "sampling_rate": 16000, 43 | "min_sampling": 16384, 44 | "ext": "WAV", 45 | "recurse": true, 46 | "limit": 100, 47 | "offset": 400 48 | }, 49 | "noise": { 50 | "database": "./data/Nonspeech", 51 | "sampling_rate": 16000, 52 | "types": [ 53 | "n2", 54 | "n19", 55 | "n21", 56 | "n31", 57 | "n47", 58 | "n52", 59 | "n57", 60 | "n73", 61 | "n80", 62 | "n89" 63 | ] 64 | }, 65 | "snr": [ 66 | -10, 67 | -5, 68 | 0, 69 | 5, 70 | 10 71 | ] 72 | } 73 | ] 74 | } 75 | -------------------------------------------------------------------------------- /config_examples/config_metricGAN_low_snr.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": [ 3 | { 4 | "name": "Train", 5 | "clean": { 6 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 7 | "sampling_rate": 16000, 8 | "min_sampling": 16384, 9 | "ext": "WAV", 10 | "recurse": true, 11 | "limit": 300, 12 | "offset": 0 13 | }, 14 | "noise": { 15 | "database": "./data/Nonspeech", 16 | "sampling_rate": 16000, 17 | "types": [ 18 | "n1", 19 | "n18", 20 | "n20", 21 | "n30", 22 | "n46", 23 | "n51", 24 | "n56", 25 | "n72", 26 | "n79", 27 | "n88" 28 | ] 29 | }, 30 | "snr": [ 31 | -15, 32 | -10, 33 | -5, 34 | 0, 35 | 5 36 | ] 37 | }, 38 | { 39 | "name": "Test", 40 | "clean": { 41 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 42 | "sampling_rate": 16000, 43 | "min_sampling": 16384, 44 | "ext": "WAV", 45 | "recurse": true, 46 | "limit": 100, 47 | "offset": 400 48 | }, 49 | "noise": { 50 | "database": "./data/Nonspeech", 51 | "sampling_rate": 16000, 52 | "types": [ 53 | "n2", 54 | "n19", 55 | "n21", 56 | "n31", 57 | "n47", 58 | "n52", 59 | "n57", 60 | "n73", 61 | "n80", 62 | "n89" 63 | ] 64 | }, 65 | "snr": [ 66 | -20, 67 | -15, 68 | -10, 69 | -5, 70 | 0 71 | ] 72 | } 73 | ] 74 | } 75 | -------------------------------------------------------------------------------- /config_examples/config_pix2pix_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": [ 3 | { 4 | "name": "test", 5 | "clean": { 6 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TEST", 7 | "sampling_rate": 16000, 8 | "min_sampling": 65536, 9 | "_comment": "stft(65536)=257,257,最后一帧是 pad 出来的,不确定好坏,先考虑丢掉", 10 | "ext": "WAV", 11 | "recurse": true, 12 | "limit": 500, 13 | "offset": 0 14 | }, 15 | "noise": { 16 | "database": "./data/noise", 17 | "sampling_rate": 16000, 18 | "types": [ 19 | "babble", 20 | "factoryfloor1", 21 | "destroyerops", 22 | "destroyerengine", 23 | "factoryfloor2" 24 | ] 25 | }, 26 | "snr": [ 27 | 0, 28 | -3, 29 | -5, 30 | -7, 31 | -10, 32 | -12, 33 | -15, 34 | -17, 35 | -20 36 | ] 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /config_examples/config_pix2pix_train.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": [ 3 | { 4 | "name": "train", 5 | "clean": { 6 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 7 | "sampling_rate": 16000, 8 | "min_sampling": 65536, 9 | "_comment": "stft(65536)=257,257,最后一帧是 pad 出来的,不确定好坏,先考虑丢掉", 10 | "ext": 
"WAV", 11 | "recurse": true, 12 | "limit": 2400, 13 | "offset": 0 14 | }, 15 | "noise": { 16 | "database": "./data/noise", 17 | "sampling_rate": 16000, 18 | "types": [ 19 | "babble", 20 | "factoryfloor1", 21 | "destroyerops", 22 | "destroyerengine" 23 | ] 24 | }, 25 | "snr": [ 26 | 0, 27 | -5, 28 | -10, 29 | -15 30 | ] 31 | } 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /config_examples/config_unet_gan.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": [ 3 | { 4 | "name": "train", 5 | "clean": { 6 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 7 | "sampling_rate": 16000, 8 | "min_sampling": 16384, 9 | "ext": "WAV", 10 | "recurse": true, 11 | "limit": 600, 12 | "offset": 0 13 | }, 14 | "noise": { 15 | "database": "./data/noise", 16 | "sampling_rate": 16000, 17 | "types": [ 18 | "babble", 19 | "factoryfloor1", 20 | "destroyerops", 21 | "destroyerengine" 22 | ] 23 | }, 24 | "snr": [ 25 | 0, 26 | -5, 27 | -10, 28 | -15 29 | ] 30 | }, 31 | { 32 | "name": "test", 33 | "clean": { 34 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TEST", 35 | "sampling_rate": 16000, 36 | "min_sampling": 16384, 37 | "ext": "WAV", 38 | "recurse": true, 39 | "limit": 100, 40 | "offset": 0 41 | }, 42 | "noise": { 43 | "database": "./data/noise", 44 | "sampling_rate": 16000, 45 | "types": [ 46 | "babble", 47 | "factoryfloor1", 48 | "destroyerops", 49 | "destroyerengine", 50 | "factoryfloor2" 51 | ] 52 | }, 53 | "snr": [ 54 | 0, 55 | -3, 56 | -5, 57 | -7, 58 | -10, 59 | -12, 60 | -15, 61 | -17, 62 | -20 63 | ] 64 | } 65 | ] 66 | } 67 | -------------------------------------------------------------------------------- /frequency_domain_0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import joblib 7 | import librosa 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from utils.extract_features import lps 12 | from utils.utils import (add_noise_for_waveform, 13 | corrected_the_length_of_noise_and_clean_speech, 14 | load_wavs, prepare_empty_dirs) 15 | 16 | 17 | def main(config, random_seed, dist): 18 | """ 19 | 构建*频域*上的语音增强数据集(Log Power Spectrum) 20 | 数据集为语句级别,带噪语音和它相应纯净语音的频谱尺寸相同 21 | 22 | Steps: 23 | 1. 加载纯净语音信号 24 | 2. 加载噪声文件 25 | 3. 在纯净语音信号上叠加噪声信号 26 | 4. 分别计算 LPS 特征 27 | 5. 分别存储带噪语音与纯净语音 28 | 29 | Args: 30 | config (dict): 配置信息 31 | random_seed (int): 随机种子 32 | dist (str): 输出结果的目录 33 | 34 | Dataset: 35 | dataset_1/ 36 | mixture.npy 37 | clean.npy 38 | ... 39 | 40 | mixture.npy is { 41 | "0001_babble_-5": (257, T), 42 | "0001_babble_-10": (257, T), 43 | ... 44 | } 45 | 46 | clean.npy is { 47 | "0001": (257, T), 48 | "0002": (257, T), 49 | ... 
50 | } 51 | """ 52 | global clean_lps 53 | np.random.seed(random_seed) 54 | dist_dir = Path(dist) 55 | 56 | # 以遍历的方式读取 config.json 中各个数据集的配置项 57 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 58 | dataset_dir = dist_dir / dataset_cfg["name"] 59 | prepare_empty_dirs([dataset_dir]) 60 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 61 | 62 | # 加载纯净语音信号,存至 list 中 63 | clean_cfg = dataset_cfg["clean"] 64 | clean_speech_paths = librosa.util.find_files( 65 | directory=clean_cfg["database"], 66 | ext=clean_cfg["ext"], 67 | recurse=clean_cfg["recurse"], 68 | limit=clean_cfg["limit"], 69 | offset=clean_cfg["offset"] 70 | ) 71 | random.shuffle(clean_speech_paths) 72 | clean_ys = load_wavs( 73 | file_paths=clean_speech_paths, 74 | sr=clean_cfg["sampling_rate"], 75 | min_sampling=clean_cfg["min_sampling"], 76 | ) 77 | print("Loaded clean speeches.") 78 | 79 | # 加载噪声信号,存至 dict 中 80 | noise_cfg = dataset_cfg["noise"] 81 | noise_database_dir = Path(noise_cfg["database"]) 82 | noise_ys = {} 83 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 84 | mixture, _ = librosa.load( 85 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 86 | sr=noise_cfg["sampling_rate"]) 87 | noise_ys[noise_type] = mixture 88 | print("Loaded noise.") 89 | 90 | # 合成带噪语音 91 | mixture_store = {} 92 | clean_store = {} 93 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 94 | num = str(i).zfill(4) 95 | for snr in dataset_cfg["snr"]: 96 | for noise_type in noise_ys.keys(): 97 | basename_text = f"{num}_{noise_type}_{snr}" 98 | 99 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 100 | clean_y=clean, 101 | noise_y=noise_ys[noise_type] 102 | ) 103 | 104 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 105 | assert len(mixture) == len(clean) == len(noise) 106 | 107 | mixture_lps = lps(mixture) 108 | clean_lps = lps(clean) 109 | 110 | assert mixture_lps.shape[0] == clean_lps.shape[0] == 257 111 | mixture_store[basename_text] = mixture_lps 112 | 113 | clean_store[num] = clean_lps 114 | 115 | print(f"Synthesize finished,storing file...") 116 | joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix()) 117 | joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix()) 118 | 119 | if __name__ == "__main__": 120 | parser = argparse.ArgumentParser(description="合成频域带噪语音") 121 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 122 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 123 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 124 | args = parser.parse_args() 125 | 126 | config = json.load(open(args.config)) 127 | main(config, args.random_seed, args.dist) 128 | -------------------------------------------------------------------------------- /frequency_domain_1.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import joblib 7 | import librosa 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from utils.extract_features import lps 12 | from utils.utils import (add_noise_for_waveform, 13 | corrected_the_length_of_noise_and_clean_speech, 14 | load_wavs, prepare_empty_dirs, unfold_spectrum) 15 | 16 | 17 | def main(config, random_seed, dist, n_pad): 18 | """ 19 | 构建*频域*上的语音增强数据集(Log Power Spectrum) 20 | 每句带噪语音的时间步上都包含多帧,多帧的中心帧对应这个时间步上的一帧纯净语音 21 | 中心帧前面的时间帧: 22 | 中心帧后面的时间帧: 23 | 
TODO 文档等待进一步更新 24 | 25 | Steps: 26 | 1. 加载纯净语音信号 27 | 2. 加载噪声文件 28 | 3. 在纯净语音信号上叠加噪声信号 29 | 4. 分别计算 LPS 特征 30 | 5. 将带噪语音的 LPS 特征进行拓展 31 | 5. 分别存储带噪语音与纯净语音 32 | 33 | Args: 34 | config (dict): 配置信息 35 | random_seed (int): 随机种子 36 | dist (str): 输出结果的目录 37 | n_pad (int): 带噪语音的拓展大小 38 | 39 | Dataset: 40 | dataset_1/ 41 | mixture.npy 42 | clean.npy 43 | ... 44 | 45 | mixture.npy is { 46 | "0001_babble_-5": (257 * 3 * , T), 47 | "0001_babble_-10": (257 * 3, T), 48 | ... 49 | } 50 | 51 | clean.npy is { 52 | "0001": (257, T), 53 | "0002": (257, T), 54 | ... 55 | } 56 | """ 57 | global clean_lps 58 | np.random.seed(random_seed) 59 | dist_dir = Path(dist) 60 | 61 | # 以遍历的方式读取 config.json 中各个数据集的配置项 62 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 63 | dataset_dir = dist_dir / dataset_cfg["name"] 64 | prepare_empty_dirs([dataset_dir]) 65 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 66 | 67 | # 加载纯净语音信号,存至 list 中 68 | clean_cfg = dataset_cfg["clean"] 69 | clean_speech_paths = librosa.util.find_files( 70 | directory=clean_cfg["database"], 71 | ext=clean_cfg["ext"], 72 | recurse=clean_cfg["recurse"], 73 | limit=clean_cfg["limit"], 74 | offset=clean_cfg["offset"] 75 | ) 76 | random.shuffle(clean_speech_paths) 77 | clean_ys = load_wavs( 78 | file_paths=clean_speech_paths, 79 | sr=clean_cfg["sampling_rate"], 80 | min_sampling=clean_cfg["min_sampling"], 81 | ) 82 | print("Loaded clean speeches.") 83 | 84 | # 加载噪声信号,存至 dict 中 85 | noise_cfg = dataset_cfg["noise"] 86 | noise_database_dir = Path(noise_cfg["database"]) 87 | noise_ys = {} 88 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 89 | mixture, _ = librosa.load( 90 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 91 | sr=noise_cfg["sampling_rate"]) 92 | noise_ys[noise_type] = mixture 93 | print("Loaded noise.") 94 | 95 | # 合成带噪语音 96 | mixture_store = {} 97 | clean_store = {} 98 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 99 | num = str(i).zfill(4) 100 | for snr in dataset_cfg["snr"]: 101 | for noise_type in noise_ys.keys(): 102 | basename_text = f"{num}_{noise_type}_{snr}" 103 | 104 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 105 | clean_y=clean, 106 | noise_y=noise_ys[noise_type] 107 | ) 108 | 109 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 110 | assert len(mixture) == len(clean) == len(noise) 111 | 112 | mixture_lps = lps(mixture) 113 | clean_lps = lps(clean) 114 | mixture_lps = unfold_spectrum(mixture_lps, n_pad=n_pad) 115 | 116 | assert mixture_lps.shape[0] == clean_lps.shape[0] == 257 117 | mixture_store[basename_text] = mixture_lps 118 | 119 | clean_store[num] = clean_lps 120 | 121 | print(f"Synthesize finished,storing file...") 122 | joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix()) 123 | joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix()) 124 | 125 | 126 | if __name__ == "__main__": 127 | parser = argparse.ArgumentParser(description="合成频域带噪语音(可拓展帧)") 128 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 129 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 130 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 131 | parser.add_argument("-P", "--n_pad", default=3, type=int, help="带噪语音需要拓展的大小") 132 | args = parser.parse_args() 133 | 134 | config = json.load(open(args.config)) 135 | main(config, args.random_seed, args.dist, args.n_pad) 136 | 
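A minimal sketch (not part of the repository) of how the frame-level output written by this script can be consumed. `unfold_spectrum` lays the `T` context windows side by side, so each `mixture.pkl` entry has shape `(257, T * (2 * n_pad + 1))` while the matching `clean.pkl` entry stays `(257, T)`; the `./dist/train` paths, the `N_PAD` constant, and the `frame_context` helper below are illustrative assumptions.

```python
import joblib

N_PAD = 3  # must match the -P/--n_pad value used when building the dataset


def frame_context(unfolded, t, n_pad=N_PAD):
    """Slice the (257, 2 * n_pad + 1) window whose center frame is frame t."""
    width = 2 * n_pad + 1
    return unfolded[:, t * width:(t + 1) * width]


mixture_store = joblib.load("./dist/train/mixture.pkl")  # {"0001_<noise>_<snr>": (257, T * 7), ...}
clean_store = joblib.load("./dist/train/clean.pkl")      # {"0001": (257, T), ...}

key = next(iter(mixture_store))  # e.g. "0001_destroyerops_-5"
num = key.split("_")[0]          # clean utterances are keyed by the index alone
noisy, clean = mixture_store[key], clean_store[num]

x = frame_context(noisy, t=10)   # model input: 2 * n_pad + 1 consecutive noisy frames
y = clean[:, 10]                 # target: the clean frame aligned with the window center
```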
-------------------------------------------------------------------------------- /mask_0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import joblib 7 | import librosa 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from utils.extract_features import mag 12 | from utils.utils import (add_noise_for_waveform, 13 | corrected_the_length_of_noise_and_clean_speech, 14 | load_wavs, prepare_empty_dirs, input_normalization, unfold_spectrum) 15 | 16 | 17 | def main(config, random_seed, dist, apply_normalization, n_pad): 18 | """ 19 | 构建 IRM(Ideal ratio mask)语音增强数据集 20 | 数据集为语句级别,带噪语音和它相应纯净语音的频谱尺寸相同 21 | 22 | Steps: 23 | 1. 加载纯净语音信号 24 | 2. 加载噪声文件 25 | 3. 在纯净语音信号上叠加噪声信号 26 | 4. 计算频谱,mask等 27 | 5. 分别存储带噪语音的频谱与 mask 28 | 29 | Args: 30 | config (dict): 配置信息 31 | random_seed (int): 随机种子 32 | dist (str): 输出结果的目录 33 | apply_normalization (bool): 是否对 mixture 语音进行规范化 34 | n_pad (int): mixture 语音中帧的拓展范围,拓展后中心帧对应 mask 中的一帧 35 | 36 | Dataset: 37 | dataset_1/ 38 | mixture.npy 39 | mask.npy 40 | ... 41 | 42 | mixture.npy is { 43 | "0001_babble_-5": (257, T * (n_pad * 2 + 1)), 44 | "0001_babble_-10": (257, T * T * (n_pad * 2 + 1)) 45 | ... 46 | } 47 | 48 | mask.npy is { 49 | "0001_babble_-5": (257, T), 50 | "0001_babble_-10": (257, T), 51 | ... 52 | } 53 | """ 54 | global clean_lps 55 | np.random.seed(random_seed) 56 | dist_dir = Path(dist) 57 | 58 | # 以遍历的方式读取 config.json 中各个数据集的配置项 59 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 60 | dataset_dir = dist_dir / dataset_cfg["name"] 61 | prepare_empty_dirs([dataset_dir]) 62 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 63 | 64 | # 加载纯净语音信号,存至 list 中 65 | clean_cfg = dataset_cfg["clean"] 66 | clean_speech_paths = librosa.util.find_files( 67 | directory=clean_cfg["database"], 68 | ext=clean_cfg["ext"], 69 | recurse=clean_cfg["recurse"], 70 | limit=clean_cfg["limit"], 71 | offset=clean_cfg["offset"] 72 | ) 73 | random.shuffle(clean_speech_paths) 74 | clean_ys = load_wavs( 75 | file_paths=clean_speech_paths, 76 | sr=clean_cfg["sampling_rate"], 77 | min_sampling=clean_cfg["min_sampling"], 78 | ) 79 | print("Loaded clean speeches.") 80 | 81 | # 加载噪声信号,存至 dict 中 82 | noise_cfg = dataset_cfg["noise"] 83 | noise_database_dir = Path(noise_cfg["database"]) 84 | noise_ys = {} 85 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 86 | mixture, _ = librosa.load( 87 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 88 | sr=noise_cfg["sampling_rate"]) 89 | noise_ys[noise_type] = mixture 90 | print("Loaded noise.") 91 | 92 | # 合成带噪语音 93 | mixture_store = {} 94 | mask_store = {} 95 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 96 | num = str(i).zfill(4) 97 | for snr in dataset_cfg["snr"]: 98 | for noise_type in noise_ys.keys(): 99 | basename_text = f"{num}_{noise_type}_{snr}" 100 | 101 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 102 | clean_y=clean, 103 | noise_y=noise_ys[noise_type] 104 | ) 105 | 106 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 107 | 108 | mixture_mag = mag(mixture) 109 | clean_mag = mag(clean) 110 | noise_mag = mag(noise) 111 | 112 | if apply_normalization: 113 | mixture_mag = input_normalization(mixture_mag) 114 | 115 | mixture_mag = unfold_spectrum(mixture_mag, n_pad=n_pad) 116 | mask = noise_mag / (noise_mag + clean_mag) 117 | 118 | assert mixture_mag.shape[0] == 
mask.shape[0] == 257 119 | mixture_store[basename_text] = mixture_mag 120 | mask_store[basename_text] = mask 121 | 122 | 123 | print(f"Synthesize finished,storing file...") 124 | joblib.dump(mask_store, (dataset_dir / "mask.pkl").as_posix()) 125 | joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix()) 126 | 127 | 128 | if __name__ == "__main__": 129 | parser = argparse.ArgumentParser(description="合成频域带噪语音") 130 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 131 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 132 | parser.add_argument("-A", "--apply_normalization", action="store_true", help="对输入应用规范化,即减去均值除以标准差") 133 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 134 | parser.add_argument("-P", "--n_pad", default=3, type=int, help="带噪语音需要拓展的大小") 135 | args = parser.parse_args() 136 | 137 | config = json.load(open(args.config)) 138 | main(config, args.random_seed, args.dist, args.apply_normalization, args.n_pad) 139 | -------------------------------------------------------------------------------- /time_domain.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import joblib 7 | import librosa 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from utils.utils import (add_noise_for_waveform, 12 | corrected_the_length_of_noise_and_clean_speech, 13 | load_wavs, prepare_empty_dirs) 14 | 15 | 16 | def main(config, random_seed, dist): 17 | """ 18 | 构建时域上的语音增强数据集 19 | 20 | Steps: 21 | 1. 加载纯净语音信号 22 | 2. 加载噪声文件 23 | 3. 在纯净语音信号上叠加噪声信号 24 | 4. 分别存储带噪语音与纯净语音 25 | 26 | Args: 27 | config (dict): 配置信息 28 | random_seed (int): 随机种子 29 | dist (str): 输出结果的目录 30 | 31 | Dataset: 32 | dataset_1/ 33 | mixture.npy 34 | clean.npy 35 | ... 36 | 37 | mixture.npy is { 38 | "0001_babble_-5": [signals, ...], 39 | "0001_babble_-10": [signals, ...], 40 | ... 41 | } 42 | 43 | clean.npy is { 44 | "0001": [signals, ...], 45 | "0002": [signals, ...], 46 | ... 
47 | } 48 | """ 49 | np.random.seed(random_seed) 50 | dist_dir = Path(dist) 51 | 52 | # 以遍历的方式读取 config.json 中各个数据集的配置项 53 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 54 | dataset_dir = dist_dir / dataset_cfg["name"] 55 | prepare_empty_dirs([dataset_dir]) 56 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 57 | 58 | # 加载纯净语音信号,存至 list 中 59 | clean_cfg = dataset_cfg["clean"] 60 | clean_speech_paths = librosa.util.find_files( 61 | directory=clean_cfg["database"], 62 | ext=clean_cfg["ext"], 63 | recurse=clean_cfg["recurse"], 64 | limit=clean_cfg["limit"], 65 | offset=clean_cfg["offset"] 66 | ) 67 | random.shuffle(clean_speech_paths) 68 | clean_ys = load_wavs( 69 | file_paths=clean_speech_paths, 70 | sr=clean_cfg["sampling_rate"], 71 | min_sampling=clean_cfg["min_sampling"], 72 | ) 73 | print("Loaded clean speeches.") 74 | 75 | # 加载噪声信号,存至 dict 中 76 | noise_cfg = dataset_cfg["noise"] 77 | noise_database_dir = Path(noise_cfg["database"]) 78 | noise_ys = {} 79 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 80 | mixture, _ = librosa.load( 81 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 82 | sr=noise_cfg["sampling_rate"]) 83 | noise_ys[noise_type] = mixture 84 | print("Loaded noise.") 85 | 86 | # 合成带噪语音 87 | mixture_store = {} 88 | clean_store = {} 89 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 90 | num = str(i).zfill(4) 91 | for snr in dataset_cfg["snr"]: 92 | for noise_type in noise_ys.keys(): 93 | basename_text = f"{num}_{noise_type}_{snr}" 94 | 95 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 96 | clean_y=clean, 97 | noise_y=noise_ys[noise_type] 98 | ) 99 | 100 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 101 | assert len(mixture) == len(clean) == len(noise) 102 | 103 | mixture_store[basename_text] = mixture 104 | 105 | # 基于一条纯净语音可以合成多种类型的带噪语音,但仅存储一份纯净语音 106 | clean_store[num] = clean 107 | 108 | print(f"Synthesize finished,storing file...") 109 | joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix()) 110 | joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix()) 111 | 112 | if __name__ == "__main__": 113 | parser = argparse.ArgumentParser(description="合成时域带噪语音") 114 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 115 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 116 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 117 | args = parser.parse_args() 118 | 119 | config = json.load(open(args.config)) 120 | main(config, args.random_seed, args.dist) 121 | -------------------------------------------------------------------------------- /time_domain_wav.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import librosa 7 | import numpy as np 8 | from tqdm import tqdm 9 | 10 | from utils.utils import (add_noise_for_waveform, 11 | corrected_the_length_of_noise_and_clean_speech, 12 | load_wavs, prepare_empty_dirs) 13 | from itertools import permutations 14 | 15 | def main(config, random_seed, dist): 16 | """ 17 | 构建时域上的语音增强数据集 18 | 19 | Steps: 20 | 1. 加载纯净语音信号 21 | 2. 加载噪声文件 22 | 3. 在纯净语音信号上叠加噪声信号 23 | 4. 分别存储带噪语音与纯净语音 24 | 25 | Args: 26 | config (dict): 配置信息 27 | random_seed (int): 随机种子 28 | dist (str): 输出结果的目录 29 | 30 | Dataset: 31 | dataset_1/ 32 | mixture.npy 33 | clean.npy 34 | ... 
35 | 36 | mixture.npy is { 37 | "0001_babble_-5": [signals, ...], 38 | "0001_babble_-10": [signals, ...], 39 | ... 40 | } 41 | 42 | clean.npy is { 43 | "0001": [signals, ...], 44 | "0002": [signals, ...], 45 | ... 46 | } 47 | """ 48 | np.random.seed(random_seed) 49 | dist_dir = Path(dist) 50 | 51 | # 以遍历的方式读取 config.json 中各个数据集的配置项 52 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 53 | dataset_dir = dist_dir / dataset_cfg["name"] 54 | prepare_empty_dirs([dataset_dir, dataset_dir / "Clean", dataset_dir / "Noisy"]) 55 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 56 | 57 | # 加载纯净语音信号,存至 list 中 58 | clean_cfg = dataset_cfg["clean"] 59 | clean_speech_paths = librosa.util.find_files( 60 | directory=clean_cfg["database"], 61 | ext=clean_cfg["ext"], 62 | recurse=clean_cfg["recurse"], 63 | limit=clean_cfg["limit"], 64 | offset=clean_cfg["offset"] 65 | ) 66 | random.shuffle(clean_speech_paths) 67 | clean_ys = load_wavs( 68 | file_paths=clean_speech_paths, 69 | sr=clean_cfg["sampling_rate"], 70 | min_sampling=clean_cfg["min_sampling"], 71 | ) 72 | print("Loaded clean speeches.") 73 | 74 | # 加载噪声信号,存至 dict 中 75 | noise_cfg = dataset_cfg["noise"] 76 | noise_database_dir = Path(noise_cfg["database"]) 77 | noise_ys = {} 78 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 79 | mixture, _ = librosa.load( 80 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 81 | sr=noise_cfg["sampling_rate"]) 82 | noise_ys[noise_type] = mixture 83 | print("Loaded noise.") 84 | 85 | # 合成带噪语音 86 | n = 0 87 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 88 | for snr in dataset_cfg["snr"]: 89 | for noise_type in noise_ys.keys(): 90 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 91 | clean_y=clean, 92 | noise_y=noise_ys[noise_type] 93 | ) 94 | 95 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 96 | assert len(mixture) == len(clean) == len(noise) 97 | 98 | fname = f"{dataset_cfg['name']}_{n}.wav" 99 | librosa.output.write_wav((dataset_dir / "Clean" / fname).as_posix(), clean, sr=16000) 100 | librosa.output.write_wav((dataset_dir / "Noisy" / fname).as_posix(), mixture, sr=16000) 101 | n += 1 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser(description="合成时域带噪语音") 105 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 106 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 107 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 108 | args = parser.parse_args() 109 | 110 | config = json.load(open(args.config)) 111 | main(config, args.random_seed, args.dist) 112 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoxiangsnr/Build-SE-Dataset/d3bace24f5e14e8fb58428174555f612d32393d4/utils/__init__.py -------------------------------------------------------------------------------- /utils/extract_features.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | 4 | 5 | def lps(y, pad=0): 6 | """ 7 | 提取 Log Power Spectrum,仅支持 sr=16000 的波形 8 | 9 | Args: 10 | y: 信号 11 | pad: 左右填充 12 | 13 | Returns: 14 | lps: (257, T) 15 | """ 16 | D = librosa.stft(y, n_fft=512, hop_length=256, window='hamming') 17 | lps = np.log(np.power(np.abs(D), 2)) 18 | if 
pad != 0: 19 | lps = np.concatenate((np.zeros((257, pad)), lps, np.zeros((257, pad))), axis=1) 20 | return lps 21 | 22 | def mag(y): 23 | D = librosa.stft(y, n_fft=512, hop_length=256, window='hamming') 24 | return np.abs(D) 25 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import librosa 4 | import numpy as np 5 | import torch 6 | import soundfile 7 | from tqdm import tqdm 8 | import time 9 | 10 | def corrected_the_length_of_noise_and_clean_speech(clean_y, noise_y): 11 | """ 12 | 合成带噪语音前的长度矫正,使 len(clean_y) == len(noise_y) 13 | """ 14 | if len(clean_y) < len(noise_y): 15 | # 大多数情况,噪声比语音长 16 | return clean_y, noise_y[:len(clean_y)] 17 | elif len(clean_y) > len(noise_y): 18 | # 极少数情况,噪声比纯净语音短。此时需要将噪声重复多次,直到可以达到纯净语音的长度 19 | pad_factor = (len(clean_y) // len(noise_y)) # 拓展系数为需要拓展的次数,不包括原有的 20 | padded_noise_y = noise_y 21 | for i in range(pad_factor): 22 | padded_noise_y = np.concatenate((padded_noise_y, noise_y)) 23 | noise_y = padded_noise_y 24 | return clean_y, noise_y[:len(clean_y)] 25 | else: 26 | return clean_y, noise_y 27 | 28 | def get_name_and_ext(path): 29 | name, ext = os.path.splitext(os.path.basename(path)) 30 | return name, ext 31 | 32 | 33 | def load_noises(noise_wav_paths): 34 | """ 35 | 根据噪声列表加载噪声 36 | Args: 37 | noise_wav_paths (list): 噪声文件的路径列表 38 | 39 | Returns: 40 | dict: {"babble": [signals]} 41 | """ 42 | out = {} 43 | for noise_path in tqdm(noise_wav_paths, desc="Loading noises: "): 44 | name, _ = get_name_and_ext(noise_path) 45 | wav, _ = librosa.load(noise_path, sr=16000) 46 | out[name] = wav 47 | 48 | return out 49 | 50 | 51 | def input_normalization(m): 52 | mean = np.mean(m, axis=0) 53 | std_var = np.std(m, axis=0) 54 | return (m - mean) / std_var 55 | 56 | def add_noise_for_waveform(s, n, db): 57 | """ 58 | 为语音文件叠加噪声 59 | ---- 60 | para: 61 | s:原语音的时域信号 62 | n:噪声的时域信号 63 | db:信噪比 64 | ---- 65 | return: 66 | 叠加噪声后的语音 67 | """ 68 | alpha = np.sqrt( 69 | np.sum(s ** 2) / (np.sum(n ** 2) * 10 ** (db / 10)) 70 | ) 71 | mix = s + alpha * n 72 | return mix 73 | 74 | 75 | def prepare_empty_dirs(dirs: list): 76 | """ 77 | 建立空目录。若已经存在,则删除后创建。 78 | parents=True 79 | 80 | Args: 81 | dirs: Path list 82 | 83 | Returns: 84 | dirs 中各个目录的句柄 85 | """ 86 | result = [] 87 | for d in dirs: 88 | if d.exists(): 89 | shutil.rmtree(d.as_posix()) 90 | d.mkdir(parents=True, exist_ok=False) 91 | result.append(d) 92 | return result 93 | 94 | 95 | def load_wavs(file_paths, sr=16000, min_sampling=0): 96 | """ 97 | 根据 file_paths 逐个加载 wav 文件 98 | 99 | 可以指定: 100 | - wav 文件需要满足的最小采样点数 101 | - 需要加载的 wav 文件数量,直到遍历完整个 list 或 满足了 limit 指定的数量要求 102 | 103 | Args: 104 | file_paths: 候选集合,其中采样点数大于 minimum_sampling 的 wav 才能被加载成功 105 | limit: 要求加载的数量上限 106 | sr: 采样率 107 | min_sampling: 最小采样点数 108 | """ 109 | wavs = [] 110 | actual_num = 0 111 | 112 | for i, path in tqdm(enumerate(file_paths), desc="Loading wavs ..."): 113 | wav, _ = librosa.load(path, sr=sr) 114 | if len(wav) >= min_sampling: 115 | wavs.append(wav) 116 | actual_num += 1 117 | else: 118 | print(f"The length of {file_paths[i]} < min sampling ...") 119 | 120 | print(f"需加载 wav 文件数量为:{len(file_paths)}") 121 | print(f"实际加载 wav 文件数量为:{actual_num}") 122 | return wavs 123 | 124 | def unfold_spectrum(spec, n_pad=3): 125 | """ 126 | 对频谱应用滑窗操作 127 | 128 | Args: 129 | spec (np.array): 频谱,(n_fft, T) 130 | n_pad (int): 输入帧 pad 的大小 (default: 3,即左边 3 帧,右边也是 3 帧) 131 | 132 | Returns: 133 | 
np.array -- the unfolded spectrum, with shape (n_fft, T * (n_pad * 2 + 1))
134 |     """
135 |     # Pad the spectrum on the left and right by repeating its first and last frames
136 |     left_pad_spec = np.repeat(spec[:, 0].reshape(-1, 1), n_pad, axis=1)  # (257, n_pad)
137 |     right_pad_spec = np.repeat(spec[:, -1].reshape(-1, 1), n_pad, axis=1)  # (257, n_pad)
138 |     assert left_pad_spec.shape[-1] == right_pad_spec.shape[-1] == n_pad
139 |     spec = np.concatenate([left_pad_spec, spec, right_pad_spec], axis=1).T  # (T + 2 * n_pad, 257)
140 |     spec = torch.Tensor(spec)
141 |
142 |     # Sliding-window unfolding: window size 2 * n_pad + 1, stride 1
143 |     spec_list = spec.unfold(0, 2 * n_pad + 1, 1)  # (T, 257, 2 * n_pad + 1): T windows of 2 * n_pad + 1 frames
144 |     spec = torch.cat(tuple(spec_list), dim=1).numpy()  # (257, T * (2 * n_pad + 1))
145 |
146 |     return spec
147 |
--------------------------------------------------------------------------------
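The SNR convention used by all of the scripts above comes from `add_noise_for_waveform`: the noise is scaled by `alpha = sqrt(sum(s**2) / (sum(n**2) * 10**(db/10)))` before being added, so the clean-to-scaled-noise power ratio equals the requested value in dB. A minimal sketch with synthetic signals (not part of the repository) that verifies this:

```python
import numpy as np

from utils.utils import add_noise_for_waveform

rng = np.random.default_rng(0)
clean = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)  # 1 s of a 440 Hz tone at 16 kHz
noise = rng.standard_normal(16000)                          # white noise

mixture = add_noise_for_waveform(clean, noise, -5)          # request -5 dB SNR

scaled_noise = mixture - clean
snr = 10 * np.log10(np.sum(clean ** 2) / np.sum(scaled_noise ** 2))
print(round(snr, 2))  # -5.0
```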