├── .gitignore
├── LICENSE
├── README.md
├── config.json
├── config_examples
│   ├── config_metricGAN.json
│   ├── config_metricGAN_low_snr.json
│   ├── config_pix2pix_test.json
│   ├── config_pix2pix_train.json
│   └── config_unet_gan.json
├── frequency_domain_0.py
├── frequency_domain_1.py
├── mask_0.py
├── time_domain.py
├── time_domain_wav.py
└── utils
    ├── __init__.py
    ├── extract_features.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | !data/data/.gitkeep
2 | dist*/
3 | .ipynb_checkpoints/
4 | .vscode
5 | data
6 | snippet
7 |
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | .hypothesis/
55 | .pytest_cache/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # pyenv
83 | .python-version
84 |
85 | # celery beat schedule file
86 | celerybeat-schedule
87 |
88 | # SageMath parsed files
89 | *.sage.py
90 |
91 | # Environments
92 | .env
93 | .venv
94 | env/
95 | venv/
96 | ENV/
97 | env.bak/
98 | venv.bak/
99 |
100 | # Spyder project settings
101 | .spyderproject
102 | .spyproject
103 |
104 | # Rope project settings
105 | .ropeproject
106 |
107 | # mkdocs documentation
108 | /site
109 |
110 | # mypy
111 | .mypy_cache/
112 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 郝翔
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Build Speech Enhancement Dataset
2 |
3 | Scripts for building speech enhancement datasets by mixing clean speech with noise at specified SNRs.
4 |
5 | ## Dependencies
6 |
7 | - tqdm
8 | - pytorch
9 | - librosa
10 |
11 | ## Supported features
12 |
13 | - time_domain: Speech level. The noisy waveform corresponds to the clean waveform.
14 | - time_domain_wav: Same as above, except that it saves each utterance as a separate .wav file instead of storing all the speech in a .pkl file.
15 | - frequency_domain_0: Speech level. The noisy spectrum corresponds to the clean spectrum, and they have the same size.
16 | - frequency_domain_1: Frame level. The noisy spectrum has multiple frames, and the clean speech is one frame. The center frame of the noisy spectrum is aligned with the frame of the clean speech.
17 | - frequency_domain_2: Frame level. The noisy spectrum has multiple frames, and the clean speech has multiple frames. They have the same number of frames.
18 | - mask_0: Frame level. The noisy spectrum has multiple frames, and the mask is one frame. The center frame of the noisy spectrum is aligned with the frame of the mask.
19 |
20 | ## Usage
21 |
22 | ```shell
23 | python [time_domain.py|time_domain_wav.py|frequency_domain_0.py|frequency_domain_1.py|mask_0.py] -C config.json
24 | ```
25 |
26 | ## ToDo
27 |
28 | - [x] Replace .npy with a more efficient format
29 | - [ ] Add more parameters for spectrum extraction
30 | - [ ] Add a count parameter to work with min_sampling
31 |
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 |     "dataset": [
3 |         {
4 |             "name": "train",
5 |             "clean": {
6 |                 "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN",
7 |                 "sampling_rate": 16000,
8 |                 "min_sampling": 16384,
9 |                 "ext": "WAV",
10 |                 "recurse": true,
11 |                 "limit": 400,
12 |                 "offset": 0
13 |             },
14 |             "noise": {
15 |                 "database": "./data/noise",
16 |                 "sampling_rate": 16000,
17 |                 "types": [
18 |                     "destroyerops",
19 |                     "f16",
20 |                     "factoryfloor2",
21 |                     "leopard",
22 |                     "m109"
23 |                 ]
24 |             },
25 |             "snr": [
26 |                 0,
27 |                 -5
28 |             ]
29 |         },
30 |         {
31 |             "name": "test",
32 |             "clean": {
33 |                 "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TEST",
34 |                 "sampling_rate": 16000,
35 |                 "min_sampling": 16384,
36 |                 "ext": "WAV",
37 |                 "recurse": true,
38 |                 "limit": 50,
39 |                 "offset": 0
40 |             },
41 |             "noise": {
42 |                 "database": "./data/noise",
43 |                 "sampling_rate": 16000,
44 |                 "types": [
45 |                     "babble",
46 |                     "factoryfloor1",
47 |                     "destroyerengine"
48 |                 ]
49 |             },
50 |             "snr": [
51 |                 -5,
52 |                 0,
53 |                 5
54 |             ]
55 |         }
56 |     ]
57 | }
58 |
--------------------------------------------------------------------------------
/config_examples/config_metricGAN.json:
--------------------------------------------------------------------------------
1 | {
2 |     "dataset": [
3 |         {
4 |             "name": "Train",
5 |             "clean": {
6 |                 "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN",
7 |                 "sampling_rate": 16000,
8 |                 "min_sampling": 16384,
9 |                 "ext": "WAV",
10 |                 "recurse": true,
11 |                 "limit": 300,
12 |                 "offset": 0
13 |             },
14 |             "noise": {
15 |                 "database": "./data/Nonspeech",
16 | "sampling_rate": 16000, 17 | "types": [ 18 | "n1", 19 | "n18", 20 | "n20", 21 | "n30", 22 | "n46", 23 | "n51", 24 | "n56", 25 | "n72", 26 | "n79", 27 | "n88" 28 | ] 29 | }, 30 | "snr": [ 31 | -8, 32 | -4, 33 | -0, 34 | 4, 35 | 8 36 | ] 37 | }, 38 | { 39 | "name": "Test", 40 | "clean": { 41 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 42 | "sampling_rate": 16000, 43 | "min_sampling": 16384, 44 | "ext": "WAV", 45 | "recurse": true, 46 | "limit": 100, 47 | "offset": 400 48 | }, 49 | "noise": { 50 | "database": "./data/Nonspeech", 51 | "sampling_rate": 16000, 52 | "types": [ 53 | "n2", 54 | "n19", 55 | "n21", 56 | "n31", 57 | "n47", 58 | "n52", 59 | "n57", 60 | "n73", 61 | "n80", 62 | "n89" 63 | ] 64 | }, 65 | "snr": [ 66 | -10, 67 | -5, 68 | 0, 69 | 5, 70 | 10 71 | ] 72 | } 73 | ] 74 | } 75 | -------------------------------------------------------------------------------- /config_examples/config_metricGAN_low_snr.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": [ 3 | { 4 | "name": "Train", 5 | "clean": { 6 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 7 | "sampling_rate": 16000, 8 | "min_sampling": 16384, 9 | "ext": "WAV", 10 | "recurse": true, 11 | "limit": 300, 12 | "offset": 0 13 | }, 14 | "noise": { 15 | "database": "./data/Nonspeech", 16 | "sampling_rate": 16000, 17 | "types": [ 18 | "n1", 19 | "n18", 20 | "n20", 21 | "n30", 22 | "n46", 23 | "n51", 24 | "n56", 25 | "n72", 26 | "n79", 27 | "n88" 28 | ] 29 | }, 30 | "snr": [ 31 | -15, 32 | -10, 33 | -5, 34 | 0, 35 | 5 36 | ] 37 | }, 38 | { 39 | "name": "Test", 40 | "clean": { 41 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 42 | "sampling_rate": 16000, 43 | "min_sampling": 16384, 44 | "ext": "WAV", 45 | "recurse": true, 46 | "limit": 100, 47 | "offset": 400 48 | }, 49 | "noise": { 50 | "database": "./data/Nonspeech", 51 | "sampling_rate": 16000, 52 | "types": [ 53 | "n2", 54 | "n19", 55 | "n21", 56 | "n31", 57 | "n47", 58 | "n52", 59 | "n57", 60 | "n73", 61 | "n80", 62 | "n89" 63 | ] 64 | }, 65 | "snr": [ 66 | -20, 67 | -15, 68 | -10, 69 | -5, 70 | 0 71 | ] 72 | } 73 | ] 74 | } 75 | -------------------------------------------------------------------------------- /config_examples/config_pix2pix_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": [ 3 | { 4 | "name": "test", 5 | "clean": { 6 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TEST", 7 | "sampling_rate": 16000, 8 | "min_sampling": 65536, 9 | "_comment": "stft(65536)=257,257,最后一帧是 pad 出来的,不确定好坏,先考虑丢掉", 10 | "ext": "WAV", 11 | "recurse": true, 12 | "limit": 500, 13 | "offset": 0 14 | }, 15 | "noise": { 16 | "database": "./data/noise", 17 | "sampling_rate": 16000, 18 | "types": [ 19 | "babble", 20 | "factoryfloor1", 21 | "destroyerops", 22 | "destroyerengine", 23 | "factoryfloor2" 24 | ] 25 | }, 26 | "snr": [ 27 | 0, 28 | -3, 29 | -5, 30 | -7, 31 | -10, 32 | -12, 33 | -15, 34 | -17, 35 | -20 36 | ] 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /config_examples/config_pix2pix_train.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": [ 3 | { 4 | "name": "train", 5 | "clean": { 6 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 7 | "sampling_rate": 16000, 8 | "min_sampling": 65536, 9 | "_comment": "stft(65536)=257,257,最后一帧是 pad 出来的,不确定好坏,先考虑丢掉", 10 | "ext": 
"WAV", 11 | "recurse": true, 12 | "limit": 2400, 13 | "offset": 0 14 | }, 15 | "noise": { 16 | "database": "./data/noise", 17 | "sampling_rate": 16000, 18 | "types": [ 19 | "babble", 20 | "factoryfloor1", 21 | "destroyerops", 22 | "destroyerengine" 23 | ] 24 | }, 25 | "snr": [ 26 | 0, 27 | -5, 28 | -10, 29 | -15 30 | ] 31 | } 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /config_examples/config_unet_gan.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset": [ 3 | { 4 | "name": "train", 5 | "clean": { 6 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TRAIN", 7 | "sampling_rate": 16000, 8 | "min_sampling": 16384, 9 | "ext": "WAV", 10 | "recurse": true, 11 | "limit": 600, 12 | "offset": 0 13 | }, 14 | "noise": { 15 | "database": "./data/noise", 16 | "sampling_rate": 16000, 17 | "types": [ 18 | "babble", 19 | "factoryfloor1", 20 | "destroyerops", 21 | "destroyerengine" 22 | ] 23 | }, 24 | "snr": [ 25 | 0, 26 | -5, 27 | -10, 28 | -15 29 | ] 30 | }, 31 | { 32 | "name": "test", 33 | "clean": { 34 | "database": "./data/clean/data/lisa/data/timit/raw/TIMIT/TEST", 35 | "sampling_rate": 16000, 36 | "min_sampling": 16384, 37 | "ext": "WAV", 38 | "recurse": true, 39 | "limit": 100, 40 | "offset": 0 41 | }, 42 | "noise": { 43 | "database": "./data/noise", 44 | "sampling_rate": 16000, 45 | "types": [ 46 | "babble", 47 | "factoryfloor1", 48 | "destroyerops", 49 | "destroyerengine", 50 | "factoryfloor2" 51 | ] 52 | }, 53 | "snr": [ 54 | 0, 55 | -3, 56 | -5, 57 | -7, 58 | -10, 59 | -12, 60 | -15, 61 | -17, 62 | -20 63 | ] 64 | } 65 | ] 66 | } 67 | -------------------------------------------------------------------------------- /frequency_domain_0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import joblib 7 | import librosa 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from utils.extract_features import lps 12 | from utils.utils import (add_noise_for_waveform, 13 | corrected_the_length_of_noise_and_clean_speech, 14 | load_wavs, prepare_empty_dirs) 15 | 16 | 17 | def main(config, random_seed, dist): 18 | """ 19 | 构建*频域*上的语音增强数据集(Log Power Spectrum) 20 | 数据集为语句级别,带噪语音和它相应纯净语音的频谱尺寸相同 21 | 22 | Steps: 23 | 1. 加载纯净语音信号 24 | 2. 加载噪声文件 25 | 3. 在纯净语音信号上叠加噪声信号 26 | 4. 分别计算 LPS 特征 27 | 5. 分别存储带噪语音与纯净语音 28 | 29 | Args: 30 | config (dict): 配置信息 31 | random_seed (int): 随机种子 32 | dist (str): 输出结果的目录 33 | 34 | Dataset: 35 | dataset_1/ 36 | mixture.npy 37 | clean.npy 38 | ... 39 | 40 | mixture.npy is { 41 | "0001_babble_-5": (257, T), 42 | "0001_babble_-10": (257, T), 43 | ... 44 | } 45 | 46 | clean.npy is { 47 | "0001": (257, T), 48 | "0002": (257, T), 49 | ... 
50 | } 51 | """ 52 | global clean_lps 53 | np.random.seed(random_seed) 54 | dist_dir = Path(dist) 55 | 56 | # 以遍历的方式读取 config.json 中各个数据集的配置项 57 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 58 | dataset_dir = dist_dir / dataset_cfg["name"] 59 | prepare_empty_dirs([dataset_dir]) 60 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 61 | 62 | # 加载纯净语音信号,存至 list 中 63 | clean_cfg = dataset_cfg["clean"] 64 | clean_speech_paths = librosa.util.find_files( 65 | directory=clean_cfg["database"], 66 | ext=clean_cfg["ext"], 67 | recurse=clean_cfg["recurse"], 68 | limit=clean_cfg["limit"], 69 | offset=clean_cfg["offset"] 70 | ) 71 | random.shuffle(clean_speech_paths) 72 | clean_ys = load_wavs( 73 | file_paths=clean_speech_paths, 74 | sr=clean_cfg["sampling_rate"], 75 | min_sampling=clean_cfg["min_sampling"], 76 | ) 77 | print("Loaded clean speeches.") 78 | 79 | # 加载噪声信号,存至 dict 中 80 | noise_cfg = dataset_cfg["noise"] 81 | noise_database_dir = Path(noise_cfg["database"]) 82 | noise_ys = {} 83 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 84 | mixture, _ = librosa.load( 85 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 86 | sr=noise_cfg["sampling_rate"]) 87 | noise_ys[noise_type] = mixture 88 | print("Loaded noise.") 89 | 90 | # 合成带噪语音 91 | mixture_store = {} 92 | clean_store = {} 93 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 94 | num = str(i).zfill(4) 95 | for snr in dataset_cfg["snr"]: 96 | for noise_type in noise_ys.keys(): 97 | basename_text = f"{num}_{noise_type}_{snr}" 98 | 99 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 100 | clean_y=clean, 101 | noise_y=noise_ys[noise_type] 102 | ) 103 | 104 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 105 | assert len(mixture) == len(clean) == len(noise) 106 | 107 | mixture_lps = lps(mixture) 108 | clean_lps = lps(clean) 109 | 110 | assert mixture_lps.shape[0] == clean_lps.shape[0] == 257 111 | mixture_store[basename_text] = mixture_lps 112 | 113 | clean_store[num] = clean_lps 114 | 115 | print(f"Synthesize finished,storing file...") 116 | joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix()) 117 | joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix()) 118 | 119 | if __name__ == "__main__": 120 | parser = argparse.ArgumentParser(description="合成频域带噪语音") 121 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 122 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 123 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 124 | args = parser.parse_args() 125 | 126 | config = json.load(open(args.config)) 127 | main(config, args.random_seed, args.dist) 128 | -------------------------------------------------------------------------------- /frequency_domain_1.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import joblib 7 | import librosa 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from utils.extract_features import lps 12 | from utils.utils import (add_noise_for_waveform, 13 | corrected_the_length_of_noise_and_clean_speech, 14 | load_wavs, prepare_empty_dirs, unfold_spectrum) 15 | 16 | 17 | def main(config, random_seed, dist, n_pad): 18 | """ 19 | 构建*频域*上的语音增强数据集(Log Power Spectrum) 20 | 每句带噪语音的时间步上都包含多帧,多帧的中心帧对应这个时间步上的一帧纯净语音 21 | 中心帧前面的时间帧: 22 | 中心帧后面的时间帧: 23 | 
TODO 文档等待进一步更新 24 | 25 | Steps: 26 | 1. 加载纯净语音信号 27 | 2. 加载噪声文件 28 | 3. 在纯净语音信号上叠加噪声信号 29 | 4. 分别计算 LPS 特征 30 | 5. 将带噪语音的 LPS 特征进行拓展 31 | 5. 分别存储带噪语音与纯净语音 32 | 33 | Args: 34 | config (dict): 配置信息 35 | random_seed (int): 随机种子 36 | dist (str): 输出结果的目录 37 | n_pad (int): 带噪语音的拓展大小 38 | 39 | Dataset: 40 | dataset_1/ 41 | mixture.npy 42 | clean.npy 43 | ... 44 | 45 | mixture.npy is { 46 | "0001_babble_-5": (257 * 3 * , T), 47 | "0001_babble_-10": (257 * 3, T), 48 | ... 49 | } 50 | 51 | clean.npy is { 52 | "0001": (257, T), 53 | "0002": (257, T), 54 | ... 55 | } 56 | """ 57 | global clean_lps 58 | np.random.seed(random_seed) 59 | dist_dir = Path(dist) 60 | 61 | # 以遍历的方式读取 config.json 中各个数据集的配置项 62 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 63 | dataset_dir = dist_dir / dataset_cfg["name"] 64 | prepare_empty_dirs([dataset_dir]) 65 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 66 | 67 | # 加载纯净语音信号,存至 list 中 68 | clean_cfg = dataset_cfg["clean"] 69 | clean_speech_paths = librosa.util.find_files( 70 | directory=clean_cfg["database"], 71 | ext=clean_cfg["ext"], 72 | recurse=clean_cfg["recurse"], 73 | limit=clean_cfg["limit"], 74 | offset=clean_cfg["offset"] 75 | ) 76 | random.shuffle(clean_speech_paths) 77 | clean_ys = load_wavs( 78 | file_paths=clean_speech_paths, 79 | sr=clean_cfg["sampling_rate"], 80 | min_sampling=clean_cfg["min_sampling"], 81 | ) 82 | print("Loaded clean speeches.") 83 | 84 | # 加载噪声信号,存至 dict 中 85 | noise_cfg = dataset_cfg["noise"] 86 | noise_database_dir = Path(noise_cfg["database"]) 87 | noise_ys = {} 88 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 89 | mixture, _ = librosa.load( 90 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 91 | sr=noise_cfg["sampling_rate"]) 92 | noise_ys[noise_type] = mixture 93 | print("Loaded noise.") 94 | 95 | # 合成带噪语音 96 | mixture_store = {} 97 | clean_store = {} 98 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 99 | num = str(i).zfill(4) 100 | for snr in dataset_cfg["snr"]: 101 | for noise_type in noise_ys.keys(): 102 | basename_text = f"{num}_{noise_type}_{snr}" 103 | 104 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 105 | clean_y=clean, 106 | noise_y=noise_ys[noise_type] 107 | ) 108 | 109 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 110 | assert len(mixture) == len(clean) == len(noise) 111 | 112 | mixture_lps = lps(mixture) 113 | clean_lps = lps(clean) 114 | mixture_lps = unfold_spectrum(mixture_lps, n_pad=n_pad) 115 | 116 | assert mixture_lps.shape[0] == clean_lps.shape[0] == 257 117 | mixture_store[basename_text] = mixture_lps 118 | 119 | clean_store[num] = clean_lps 120 | 121 | print(f"Synthesize finished,storing file...") 122 | joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix()) 123 | joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix()) 124 | 125 | 126 | if __name__ == "__main__": 127 | parser = argparse.ArgumentParser(description="合成频域带噪语音(可拓展帧)") 128 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 129 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 130 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 131 | parser.add_argument("-P", "--n_pad", default=3, type=int, help="带噪语音需要拓展的大小") 132 | args = parser.parse_args() 133 | 134 | config = json.load(open(args.config)) 135 | main(config, args.random_seed, args.dist, args.n_pad) 136 | 
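A minimal sketch (not part of the repository) of how the frame-level output written by this script can be consumed. `unfold_spectrum` lays the `T` context windows side by side, so each `mixture.pkl` entry has shape `(257, T * (2 * n_pad + 1))` while the matching `clean.pkl` entry stays `(257, T)`; the `./dist/train` paths, the `N_PAD` constant, and the `frame_context` helper below are illustrative assumptions.

```python
import joblib

N_PAD = 3  # must match the -P/--n_pad value used when building the dataset


def frame_context(unfolded, t, n_pad=N_PAD):
    """Slice the (257, 2 * n_pad + 1) window whose center frame is frame t."""
    width = 2 * n_pad + 1
    return unfolded[:, t * width:(t + 1) * width]


mixture_store = joblib.load("./dist/train/mixture.pkl")  # {"0001_<noise>_<snr>": (257, T * 7), ...}
clean_store = joblib.load("./dist/train/clean.pkl")      # {"0001": (257, T), ...}

key = next(iter(mixture_store))  # e.g. "0001_destroyerops_-5"
num = key.split("_")[0]          # clean utterances are keyed by the index alone
noisy, clean = mixture_store[key], clean_store[num]

x = frame_context(noisy, t=10)   # model input: 2 * n_pad + 1 consecutive noisy frames
y = clean[:, 10]                 # target: the clean frame aligned with the window center
```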
-------------------------------------------------------------------------------- /mask_0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import joblib 7 | import librosa 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from utils.extract_features import mag 12 | from utils.utils import (add_noise_for_waveform, 13 | corrected_the_length_of_noise_and_clean_speech, 14 | load_wavs, prepare_empty_dirs, input_normalization, unfold_spectrum) 15 | 16 | 17 | def main(config, random_seed, dist, apply_normalization, n_pad): 18 | """ 19 | 构建 IRM(Ideal ratio mask)语音增强数据集 20 | 数据集为语句级别,带噪语音和它相应纯净语音的频谱尺寸相同 21 | 22 | Steps: 23 | 1. 加载纯净语音信号 24 | 2. 加载噪声文件 25 | 3. 在纯净语音信号上叠加噪声信号 26 | 4. 计算频谱,mask等 27 | 5. 分别存储带噪语音的频谱与 mask 28 | 29 | Args: 30 | config (dict): 配置信息 31 | random_seed (int): 随机种子 32 | dist (str): 输出结果的目录 33 | apply_normalization (bool): 是否对 mixture 语音进行规范化 34 | n_pad (int): mixture 语音中帧的拓展范围,拓展后中心帧对应 mask 中的一帧 35 | 36 | Dataset: 37 | dataset_1/ 38 | mixture.npy 39 | mask.npy 40 | ... 41 | 42 | mixture.npy is { 43 | "0001_babble_-5": (257, T * (n_pad * 2 + 1)), 44 | "0001_babble_-10": (257, T * T * (n_pad * 2 + 1)) 45 | ... 46 | } 47 | 48 | mask.npy is { 49 | "0001_babble_-5": (257, T), 50 | "0001_babble_-10": (257, T), 51 | ... 52 | } 53 | """ 54 | global clean_lps 55 | np.random.seed(random_seed) 56 | dist_dir = Path(dist) 57 | 58 | # 以遍历的方式读取 config.json 中各个数据集的配置项 59 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 60 | dataset_dir = dist_dir / dataset_cfg["name"] 61 | prepare_empty_dirs([dataset_dir]) 62 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 63 | 64 | # 加载纯净语音信号,存至 list 中 65 | clean_cfg = dataset_cfg["clean"] 66 | clean_speech_paths = librosa.util.find_files( 67 | directory=clean_cfg["database"], 68 | ext=clean_cfg["ext"], 69 | recurse=clean_cfg["recurse"], 70 | limit=clean_cfg["limit"], 71 | offset=clean_cfg["offset"] 72 | ) 73 | random.shuffle(clean_speech_paths) 74 | clean_ys = load_wavs( 75 | file_paths=clean_speech_paths, 76 | sr=clean_cfg["sampling_rate"], 77 | min_sampling=clean_cfg["min_sampling"], 78 | ) 79 | print("Loaded clean speeches.") 80 | 81 | # 加载噪声信号,存至 dict 中 82 | noise_cfg = dataset_cfg["noise"] 83 | noise_database_dir = Path(noise_cfg["database"]) 84 | noise_ys = {} 85 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 86 | mixture, _ = librosa.load( 87 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 88 | sr=noise_cfg["sampling_rate"]) 89 | noise_ys[noise_type] = mixture 90 | print("Loaded noise.") 91 | 92 | # 合成带噪语音 93 | mixture_store = {} 94 | mask_store = {} 95 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 96 | num = str(i).zfill(4) 97 | for snr in dataset_cfg["snr"]: 98 | for noise_type in noise_ys.keys(): 99 | basename_text = f"{num}_{noise_type}_{snr}" 100 | 101 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 102 | clean_y=clean, 103 | noise_y=noise_ys[noise_type] 104 | ) 105 | 106 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 107 | 108 | mixture_mag = mag(mixture) 109 | clean_mag = mag(clean) 110 | noise_mag = mag(noise) 111 | 112 | if apply_normalization: 113 | mixture_mag = input_normalization(mixture_mag) 114 | 115 | mixture_mag = unfold_spectrum(mixture_mag, n_pad=n_pad) 116 | mask = noise_mag / (noise_mag + clean_mag) 117 | 118 | assert mixture_mag.shape[0] == 
mask.shape[0] == 257 119 | mixture_store[basename_text] = mixture_mag 120 | mask_store[basename_text] = mask 121 | 122 | 123 | print(f"Synthesize finished,storing file...") 124 | joblib.dump(mask_store, (dataset_dir / "mask.pkl").as_posix()) 125 | joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix()) 126 | 127 | 128 | if __name__ == "__main__": 129 | parser = argparse.ArgumentParser(description="合成频域带噪语音") 130 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 131 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 132 | parser.add_argument("-A", "--apply_normalization", action="store_true", help="对输入应用规范化,即减去均值除以标准差") 133 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 134 | parser.add_argument("-P", "--n_pad", default=3, type=int, help="带噪语音需要拓展的大小") 135 | args = parser.parse_args() 136 | 137 | config = json.load(open(args.config)) 138 | main(config, args.random_seed, args.dist, args.apply_normalization, args.n_pad) 139 | -------------------------------------------------------------------------------- /time_domain.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import joblib 7 | import librosa 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | from utils.utils import (add_noise_for_waveform, 12 | corrected_the_length_of_noise_and_clean_speech, 13 | load_wavs, prepare_empty_dirs) 14 | 15 | 16 | def main(config, random_seed, dist): 17 | """ 18 | 构建时域上的语音增强数据集 19 | 20 | Steps: 21 | 1. 加载纯净语音信号 22 | 2. 加载噪声文件 23 | 3. 在纯净语音信号上叠加噪声信号 24 | 4. 分别存储带噪语音与纯净语音 25 | 26 | Args: 27 | config (dict): 配置信息 28 | random_seed (int): 随机种子 29 | dist (str): 输出结果的目录 30 | 31 | Dataset: 32 | dataset_1/ 33 | mixture.npy 34 | clean.npy 35 | ... 36 | 37 | mixture.npy is { 38 | "0001_babble_-5": [signals, ...], 39 | "0001_babble_-10": [signals, ...], 40 | ... 41 | } 42 | 43 | clean.npy is { 44 | "0001": [signals, ...], 45 | "0002": [signals, ...], 46 | ... 
47 | } 48 | """ 49 | np.random.seed(random_seed) 50 | dist_dir = Path(dist) 51 | 52 | # 以遍历的方式读取 config.json 中各个数据集的配置项 53 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 54 | dataset_dir = dist_dir / dataset_cfg["name"] 55 | prepare_empty_dirs([dataset_dir]) 56 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 57 | 58 | # 加载纯净语音信号,存至 list 中 59 | clean_cfg = dataset_cfg["clean"] 60 | clean_speech_paths = librosa.util.find_files( 61 | directory=clean_cfg["database"], 62 | ext=clean_cfg["ext"], 63 | recurse=clean_cfg["recurse"], 64 | limit=clean_cfg["limit"], 65 | offset=clean_cfg["offset"] 66 | ) 67 | random.shuffle(clean_speech_paths) 68 | clean_ys = load_wavs( 69 | file_paths=clean_speech_paths, 70 | sr=clean_cfg["sampling_rate"], 71 | min_sampling=clean_cfg["min_sampling"], 72 | ) 73 | print("Loaded clean speeches.") 74 | 75 | # 加载噪声信号,存至 dict 中 76 | noise_cfg = dataset_cfg["noise"] 77 | noise_database_dir = Path(noise_cfg["database"]) 78 | noise_ys = {} 79 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 80 | mixture, _ = librosa.load( 81 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 82 | sr=noise_cfg["sampling_rate"]) 83 | noise_ys[noise_type] = mixture 84 | print("Loaded noise.") 85 | 86 | # 合成带噪语音 87 | mixture_store = {} 88 | clean_store = {} 89 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 90 | num = str(i).zfill(4) 91 | for snr in dataset_cfg["snr"]: 92 | for noise_type in noise_ys.keys(): 93 | basename_text = f"{num}_{noise_type}_{snr}" 94 | 95 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 96 | clean_y=clean, 97 | noise_y=noise_ys[noise_type] 98 | ) 99 | 100 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 101 | assert len(mixture) == len(clean) == len(noise) 102 | 103 | mixture_store[basename_text] = mixture 104 | 105 | # 基于一条纯净语音可以合成多种类型的带噪语音,但仅存储一份纯净语音 106 | clean_store[num] = clean 107 | 108 | print(f"Synthesize finished,storing file...") 109 | joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix()) 110 | joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix()) 111 | 112 | if __name__ == "__main__": 113 | parser = argparse.ArgumentParser(description="合成时域带噪语音") 114 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 115 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 116 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 117 | args = parser.parse_args() 118 | 119 | config = json.load(open(args.config)) 120 | main(config, args.random_seed, args.dist) 121 | -------------------------------------------------------------------------------- /time_domain_wav.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import random 4 | from pathlib import Path 5 | 6 | import librosa 7 | import numpy as np 8 | from tqdm import tqdm 9 | 10 | from utils.utils import (add_noise_for_waveform, 11 | corrected_the_length_of_noise_and_clean_speech, 12 | load_wavs, prepare_empty_dirs) 13 | from itertools import permutations 14 | 15 | def main(config, random_seed, dist): 16 | """ 17 | 构建时域上的语音增强数据集 18 | 19 | Steps: 20 | 1. 加载纯净语音信号 21 | 2. 加载噪声文件 22 | 3. 在纯净语音信号上叠加噪声信号 23 | 4. 分别存储带噪语音与纯净语音 24 | 25 | Args: 26 | config (dict): 配置信息 27 | random_seed (int): 随机种子 28 | dist (str): 输出结果的目录 29 | 30 | Dataset: 31 | dataset_1/ 32 | mixture.npy 33 | clean.npy 34 | ... 
35 | 36 | mixture.npy is { 37 | "0001_babble_-5": [signals, ...], 38 | "0001_babble_-10": [signals, ...], 39 | ... 40 | } 41 | 42 | clean.npy is { 43 | "0001": [signals, ...], 44 | "0002": [signals, ...], 45 | ... 46 | } 47 | """ 48 | np.random.seed(random_seed) 49 | dist_dir = Path(dist) 50 | 51 | # 以遍历的方式读取 config.json 中各个数据集的配置项 52 | for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1): 53 | dataset_dir = dist_dir / dataset_cfg["name"] 54 | prepare_empty_dirs([dataset_dir, dataset_dir / "Clean", dataset_dir / "Noisy"]) 55 | print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12) 56 | 57 | # 加载纯净语音信号,存至 list 中 58 | clean_cfg = dataset_cfg["clean"] 59 | clean_speech_paths = librosa.util.find_files( 60 | directory=clean_cfg["database"], 61 | ext=clean_cfg["ext"], 62 | recurse=clean_cfg["recurse"], 63 | limit=clean_cfg["limit"], 64 | offset=clean_cfg["offset"] 65 | ) 66 | random.shuffle(clean_speech_paths) 67 | clean_ys = load_wavs( 68 | file_paths=clean_speech_paths, 69 | sr=clean_cfg["sampling_rate"], 70 | min_sampling=clean_cfg["min_sampling"], 71 | ) 72 | print("Loaded clean speeches.") 73 | 74 | # 加载噪声信号,存至 dict 中 75 | noise_cfg = dataset_cfg["noise"] 76 | noise_database_dir = Path(noise_cfg["database"]) 77 | noise_ys = {} 78 | for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"): 79 | mixture, _ = librosa.load( 80 | (noise_database_dir / (noise_type + ".wav")).as_posix(), 81 | sr=noise_cfg["sampling_rate"]) 82 | noise_ys[noise_type] = mixture 83 | print("Loaded noise.") 84 | 85 | # 合成带噪语音 86 | n = 0 87 | for i, clean in tqdm(enumerate(clean_ys, start=1), desc="合成带噪语音"): 88 | for snr in dataset_cfg["snr"]: 89 | for noise_type in noise_ys.keys(): 90 | clean, noise = corrected_the_length_of_noise_and_clean_speech( 91 | clean_y=clean, 92 | noise_y=noise_ys[noise_type] 93 | ) 94 | 95 | mixture = add_noise_for_waveform(clean, noise, int(snr)) 96 | assert len(mixture) == len(clean) == len(noise) 97 | 98 | fname = f"{dataset_cfg['name']}_{n}.wav" 99 | librosa.output.write_wav((dataset_dir / "Clean" / fname).as_posix(), clean, sr=16000) 100 | librosa.output.write_wav((dataset_dir / "Noisy" / fname).as_posix(), mixture, sr=16000) 101 | n += 1 102 | 103 | if __name__ == "__main__": 104 | parser = argparse.ArgumentParser(description="合成时域带噪语音") 105 | parser.add_argument("-C", "--config", required=True, type=str, help="配置文件") 106 | parser.add_argument("-S", "--random_seed", default=0, type=int, help="随机种子") 107 | parser.add_argument("-O", "--dist", default="./dist", type=str, help="输出目录") 108 | args = parser.parse_args() 109 | 110 | config = json.load(open(args.config)) 111 | main(config, args.random_seed, args.dist) 112 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haoxiangsnr/Build-SE-Dataset/d3bace24f5e14e8fb58428174555f612d32393d4/utils/__init__.py -------------------------------------------------------------------------------- /utils/extract_features.py: -------------------------------------------------------------------------------- 1 | import librosa 2 | import numpy as np 3 | 4 | 5 | def lps(y, pad=0): 6 | """ 7 | 提取 Log Power Spectrum,仅支持 sr=16000 的波形 8 | 9 | Args: 10 | y: 信号 11 | pad: 左右填充 12 | 13 | Returns: 14 | lps: (257, T) 15 | """ 16 | D = librosa.stft(y, n_fft=512, hop_length=256, window='hamming') 17 | lps = np.log(np.power(np.abs(D), 2)) 18 | if 
pad != 0: 19 | lps = np.concatenate((np.zeros((257, pad)), lps, np.zeros((257, pad))), axis=1) 20 | return lps 21 | 22 | def mag(y): 23 | D = librosa.stft(y, n_fft=512, hop_length=256, window='hamming') 24 | return np.abs(D) 25 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import librosa 4 | import numpy as np 5 | import torch 6 | import soundfile 7 | from tqdm import tqdm 8 | import time 9 | 10 | def corrected_the_length_of_noise_and_clean_speech(clean_y, noise_y): 11 | """ 12 | 合成带噪语音前的长度矫正,使 len(clean_y) == len(noise_y) 13 | """ 14 | if len(clean_y) < len(noise_y): 15 | # 大多数情况,噪声比语音长 16 | return clean_y, noise_y[:len(clean_y)] 17 | elif len(clean_y) > len(noise_y): 18 | # 极少数情况,噪声比纯净语音短。此时需要将噪声重复多次,直到可以达到纯净语音的长度 19 | pad_factor = (len(clean_y) // len(noise_y)) # 拓展系数为需要拓展的次数,不包括原有的 20 | padded_noise_y = noise_y 21 | for i in range(pad_factor): 22 | padded_noise_y = np.concatenate((padded_noise_y, noise_y)) 23 | noise_y = padded_noise_y 24 | return clean_y, noise_y[:len(clean_y)] 25 | else: 26 | return clean_y, noise_y 27 | 28 | def get_name_and_ext(path): 29 | name, ext = os.path.splitext(os.path.basename(path)) 30 | return name, ext 31 | 32 | 33 | def load_noises(noise_wav_paths): 34 | """ 35 | 根据噪声列表加载噪声 36 | Args: 37 | noise_wav_paths (list): 噪声文件的路径列表 38 | 39 | Returns: 40 | dict: {"babble": [signals]} 41 | """ 42 | out = {} 43 | for noise_path in tqdm(noise_wav_paths, desc="Loading noises: "): 44 | name, _ = get_name_and_ext(noise_path) 45 | wav, _ = librosa.load(noise_path, sr=16000) 46 | out[name] = wav 47 | 48 | return out 49 | 50 | 51 | def input_normalization(m): 52 | mean = np.mean(m, axis=0) 53 | std_var = np.std(m, axis=0) 54 | return (m - mean) / std_var 55 | 56 | def add_noise_for_waveform(s, n, db): 57 | """ 58 | 为语音文件叠加噪声 59 | ---- 60 | para: 61 | s:原语音的时域信号 62 | n:噪声的时域信号 63 | db:信噪比 64 | ---- 65 | return: 66 | 叠加噪声后的语音 67 | """ 68 | alpha = np.sqrt( 69 | np.sum(s ** 2) / (np.sum(n ** 2) * 10 ** (db / 10)) 70 | ) 71 | mix = s + alpha * n 72 | return mix 73 | 74 | 75 | def prepare_empty_dirs(dirs: list): 76 | """ 77 | 建立空目录。若已经存在,则删除后创建。 78 | parents=True 79 | 80 | Args: 81 | dirs: Path list 82 | 83 | Returns: 84 | dirs 中各个目录的句柄 85 | """ 86 | result = [] 87 | for d in dirs: 88 | if d.exists(): 89 | shutil.rmtree(d.as_posix()) 90 | d.mkdir(parents=True, exist_ok=False) 91 | result.append(d) 92 | return result 93 | 94 | 95 | def load_wavs(file_paths, sr=16000, min_sampling=0): 96 | """ 97 | 根据 file_paths 逐个加载 wav 文件 98 | 99 | 可以指定: 100 | - wav 文件需要满足的最小采样点数 101 | - 需要加载的 wav 文件数量,直到遍历完整个 list 或 满足了 limit 指定的数量要求 102 | 103 | Args: 104 | file_paths: 候选集合,其中采样点数大于 minimum_sampling 的 wav 才能被加载成功 105 | limit: 要求加载的数量上限 106 | sr: 采样率 107 | min_sampling: 最小采样点数 108 | """ 109 | wavs = [] 110 | actual_num = 0 111 | 112 | for i, path in tqdm(enumerate(file_paths), desc="Loading wavs ..."): 113 | wav, _ = librosa.load(path, sr=sr) 114 | if len(wav) >= min_sampling: 115 | wavs.append(wav) 116 | actual_num += 1 117 | else: 118 | print(f"The length of {file_paths[i]} < min sampling ...") 119 | 120 | print(f"需加载 wav 文件数量为:{len(file_paths)}") 121 | print(f"实际加载 wav 文件数量为:{actual_num}") 122 | return wavs 123 | 124 | def unfold_spectrum(spec, n_pad=3): 125 | """ 126 | 对频谱应用滑窗操作 127 | 128 | Args: 129 | spec (np.array): 频谱,(n_fft, T) 130 | n_pad (int): 输入帧 pad 的大小 (default: 3,即左边 3 帧,右边也是 3 帧) 131 | 132 | Returns: 133 | 
np.array -- the unfolded spectrum, with shape (n_fft, T * (n_pad * 2 + 1))
134 |     """
135 |     # Pad the spectrum on the left and right by repeating its first and last frames
136 |     left_pad_spec = np.repeat(spec[:, 0].reshape(-1, 1), n_pad, axis=1)  # (257, n_pad)
137 |     right_pad_spec = np.repeat(spec[:, -1].reshape(-1, 1), n_pad, axis=1)  # (257, n_pad)
138 |     assert left_pad_spec.shape[-1] == right_pad_spec.shape[-1] == n_pad
139 |     spec = np.concatenate([left_pad_spec, spec, right_pad_spec], axis=1).T  # (T + 2 * n_pad, 257)
140 |     spec = torch.Tensor(spec)
141 |
142 |     # Sliding-window unfolding: window size 2 * n_pad + 1, stride 1
143 |     spec_list = spec.unfold(0, 2 * n_pad + 1, 1)  # (T, 257, 2 * n_pad + 1): T windows of 2 * n_pad + 1 frames
144 |     spec = torch.cat(tuple(spec_list), dim=1).numpy()  # (257, T * (2 * n_pad + 1))
145 |
146 |     return spec
147 |
--------------------------------------------------------------------------------
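The SNR convention used by all of the scripts above comes from `add_noise_for_waveform`: the noise is scaled by `alpha = sqrt(sum(s**2) / (sum(n**2) * 10**(db/10)))` before being added, so the clean-to-scaled-noise power ratio equals the requested value in dB. A minimal sketch with synthetic signals (not part of the repository) that verifies this:

```python
import numpy as np

from utils.utils import add_noise_for_waveform

rng = np.random.default_rng(0)
clean = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)  # 1 s of a 440 Hz tone at 16 kHz
noise = rng.standard_normal(16000)                          # white noise

mixture = add_noise_for_waveform(clean, noise, -5)          # request -5 dB SNR

scaled_noise = mixture - clean
snr = 10 * np.log10(np.sum(clean ** 2) / np.sum(scaled_noise ** 2))
print(round(snr, 2))  # -5.0
```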